352 files changed, 12642 insertions, 4529 deletions
@@ -673,6 +673,7 @@ Muchun Song <muchun.song@linux.dev> <smuchun@gmail.com>
 Ross Zwisler <zwisler@kernel.org> <ross.zwisler@linux.intel.com>
 Rudolf Marek <R.Marek@sh.cvut.cz>
 Rui Saraiva <rmps@joel.ist.utl.pt>
+Sachin Mokashi <sachin.mokashi@intel.com> <sachinx.mokashi@intel.com>
 Sachin P Sant <ssant@in.ibm.com>
 Sai Prakash Ranjan <quic_saipraka@quicinc.com> <saiprakash.ranjan@codeaurora.org>
 Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi>
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index bf03263b9f46..bc0e7fefc39d 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -861,3 +861,25 @@ Description:	This is a read-only entry to show the value of sb.s_encoding_flags,
 		SB_ENC_STRICT_MODE_FL		0x00000001
 		SB_ENC_NO_COMPAT_FALLBACK_FL	0x00000002
 		============================	==========
+
+What:		/sys/fs/f2fs/<disk>/reserved_pin_section
+Date:		June 2025
+Contact:	"Chao Yu" <chao@kernel.org>
+Description:	This threshold controls the triggering of garbage collection
+		while fallocating on a pinned file, so that enough free reserved
+		sections are guaranteed before preallocating on the pinned file.
+		By default, the value is ovp_sections; for zoned UFS in
+		particular, the value is 1.
+
+What:		/sys/fs/f2fs/<disk>/gc_boost_gc_multiple
+Date:		June 2025
+Contact:	"Daeho Jeong" <daehojeong@google.com>
+Description:	Set a multiplier for the background GC migration window when
+		F2FS GC is boosted. The range is from 1 to the segment count
+		in a section. Default: 5
+
+What:		/sys/fs/f2fs/<disk>/gc_boost_gc_greedy
+Date:		June 2025
+Contact:	"Daeho Jeong" <daehojeong@google.com>
+Description:	Control the GC algorithm for boost GC. 0: cost benefit, 1: greedy
+		Default: 1
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 210c194d4a7b..8ccc5af5ea1e 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -131,3 +131,59 @@ Get IO accounting for pid 1, it works only with -p::
 	linuxrc: read=65536, write=0, cancelled_write=0
 
 The above command can be used with -v to get more debug information.
+
+After the system starts, use `delaytop` to get system-wide delay information,
+which includes system-wide PSI information and the Top-N high-latency tasks.
+
+By default, `delaytop` sorts tasks by CPU latency in descending order,
+displays the top 20 high-latency tasks, and refreshes the latency data
+every 2 seconds.
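The pressure lines in the `delaytop` output below come from the kernel's PSI interface. A minimal userspace sketch of reading one PSI file (illustrative only; delaytop's actual implementation lives in tools/accounting/delaytop.c):

	/* psi_cpu.c - dump /proc/pressure/cpu, the source of the "CPU some/full" lines */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/pressure/cpu", "r");

		if (!f) {
			perror("/proc/pressure/cpu");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* e.g. "some avg10=0.00 ... total=345" */
		fclose(f);
		return 0;
	}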
+
+Get PSI information and Top-N tasks delay, since system boot::
+
+	bash# ./delaytop
+	System Pressure Information: (avg10/avg60/avg300/total)
+	CPU some:      0.0%/   0.0%/   0.0%/     345(ms)
+	CPU full:      0.0%/   0.0%/   0.0%/       0(ms)
+	Memory full:   0.0%/   0.0%/   0.0%/       0(ms)
+	Memory some:   0.0%/   0.0%/   0.0%/       0(ms)
+	IO full:       0.0%/   0.0%/   0.0%/      65(ms)
+	IO some:       0.0%/   0.0%/   0.0%/      79(ms)
+	IRQ full:      0.0%/   0.0%/   0.0%/       0(ms)
+	Top 20 processes (sorted by CPU delay):
+	PID    TGID   COMMAND         CPU(ms)  IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms)  WP(ms) IRQ(ms)
+	----------------------------------------------------------------------------------------------
+	161    161    zombie_memcg_re    1.40    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	130    130    blkcg_punt_bio     1.37    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	444    444    scsi_tmf_0         0.73    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	1280   1280   rsyslogd           0.53    0.04    0.00    0.00    0.00    0.00    0.00    0.00
+	12     12     ksoftirqd/0        0.47    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	1277   1277   nbd-server         0.44    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	308    308    kworker/2:2-sys    0.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	55     55     netns              0.36    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	1187   1187   acpid              0.31    0.03    0.00    0.00    0.00    0.00    0.00    0.00
+	6184   6184   kworker/1:2-sys    0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	186    186    kaluad             0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	18     18     ksoftirqd/1        0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	185    185    kmpath_rdacd       0.23    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	190    190    kstrp              0.23    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	2759   2759   agetty             0.20    0.03    0.00    0.00    0.00    0.00    0.00    0.00
+	1190   1190   kworker/0:3-sys    0.19    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	1272   1272   sshd               0.15    0.04    0.00    0.00    0.00    0.00    0.00    0.00
+	1156   1156   license            0.15    0.11    0.00    0.00    0.00    0.00    0.00    0.00
+	134    134    md                 0.13    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	6142   6142   kworker/3:2-xfs    0.13    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+
+Command-line options of delaytop::
+
+	# ./delaytop -p pid
+	Print delayacct stats of the given pid
+
+	# ./delaytop -P num
+	Display the top num tasks
+
+	# ./delaytop -n num
+	Set the number of refresh iterations (num times)
+
+	# ./delaytop -d secs
+	Specify the refresh interval as secs seconds
diff --git a/Documentation/admin-guide/device-mapper/thin-provisioning.rst b/Documentation/admin-guide/device-mapper/thin-provisioning.rst
index bafebf79da4b..b2fa49a5608a 100644
--- a/Documentation/admin-guide/device-mapper/thin-provisioning.rst
+++ b/Documentation/admin-guide/device-mapper/thin-provisioning.rst
@@ -80,11 +80,11 @@ less sharing than average you'll need a larger-than-average metadata device.
 
 As a guide, we suggest you calculate the number of bytes to use in the
 metadata device as 48 * $data_dev_size / $data_block_size but round it up
-to 2MB if the answer is smaller.  If you're creating large numbers of
+to 2MiB if the answer is smaller.  If you're creating large numbers of
 snapshots which are recording large amounts of change, you may find you
 need to increase this.
 
-The largest size supported is 16GB: If the device is larger,
+The largest size supported is 16GiB: If the device is larger,
 a warning will be issued and the excess space will not be used.
 
 Reloading a pool table
@@ -107,13 +107,13 @@ Using an existing pool device
 
 $data_block_size gives the smallest unit of disk space that can be
 allocated at a time expressed in units of 512-byte sectors.
-$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a
-multiple of 128 (64KB).  $data_block_size cannot be changed after the
+$data_block_size must be between 128 (64KiB) and 2097152 (1GiB) and a
+multiple of 128 (64KiB).  $data_block_size cannot be changed after the
 thin-pool is created.  People primarily interested in thin provisioning
-may want to use a value such as 1024 (512KB).  People doing lots of
-snapshotting may want a smaller value such as 128 (64KB).  If you are
+may want to use a value such as 1024 (512KiB).  People doing lots of
+snapshotting may want a smaller value such as 128 (64KiB).  If you are
 not zeroing newly-allocated data, a larger $data_block_size in the
-region of 256000 (128MB) is suggested.
+region of 262144 (128MiB) is suggested.
 
 $low_water_mark is expressed in blocks of size $data_block_size.  If
 free space on the data device drops below this level then a dm event
@@ -291,7 +291,7 @@ i) Constructor
     error_if_no_space: Error IOs, instead of queueing, if no space.
 
-    Data block size must be between 64KB (128 sectors) and 1GB
+    Data block size must be between 64KiB (128 sectors) and 1GiB
     (2097152 sectors) inclusive.
 
diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index 20fabdf6567e..9c6cd52f69cf 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -311,6 +311,27 @@ crashkernel syntax
 
        crashkernel=0,low
 
+4) crashkernel=size,cma
+
+   Reserve additional crash kernel memory from CMA. This reservation is
+   usable by the first system for userspace memory and kernel movable
+   allocations (memory balloon, zswap). Pages allocated from this memory
+   range will not be included in the vmcore, so this should not be used
+   if dumping of userspace memory is intended, and it has to be expected
+   that some movable kernel pages may be missing from the dump.
+
+   A standard crashkernel reservation, as described above, is still needed
+   to hold the crash kernel and initrd.
+
+   This option increases the risk of a kdump failure: DMA transfers
+   configured by the first kernel may end up corrupting the second
+   kernel's memory.
+
+   This reservation method is intended for systems that can't afford to
+   sacrifice enough memory for standard crashkernel reservation and where
+   less reliable and possibly incomplete kdump is preferable to no kdump at
+   all.
+
 Boot into System Kernel
 -----------------------
 1) Update the boot loader (such as grub, yaboot, or lilo) configuration
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8981ae1c9355..747a55abf494 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -994,6 +994,28 @@
 			0: to disable low allocation.
 			It will be ignored when crashkernel=X,high is not used
 			or memory reserved is below 4G.
 
+	crashkernel=size[KMG],cma
+			[KNL, X86] Reserve additional crash kernel memory from
+			CMA. This reservation is usable by the first system
+			for userspace memory and kernel movable allocations
+			(memory balloon, zswap). Pages allocated from this
+			memory range will not be included in the vmcore, so
+			this should not be used if dumping of userspace memory
+			is intended, and it has to be expected that some
+			movable kernel pages may be missing from the dump.
+
+			A standard crashkernel reservation, as described above,
+			is still needed to hold the crash kernel and initrd.
+
+			This option increases the risk of a kdump failure: DMA
+			transfers configured by the first kernel may end up
+			corrupting the second kernel's memory.
+
+			This reservation method is intended for systems that
+			can't afford to sacrifice enough memory for standard
+			crashkernel reservation and where less reliable and
+			possibly incomplete kdump is preferable to no kdump at
+			all.
 
 	cryptomgr.notests
 			[KNL] Disable crypto self-tests
 
@@ -1806,6 +1828,27 @@
 			backtraces on all cpus.
 			Format: 0 | 1
 
+	hash_pointers=
+			[KNL,EARLY]
+			By default, when pointers are printed to the console
+			or buffers via the %p format string, that pointer is
+			"hashed", i.e. obscured by hashing the pointer value.
+			This is a security feature that hides actual kernel
+			addresses from unprivileged users, but it also makes
+			debugging the kernel more difficult since unequal
+			pointers can no longer be compared. The choices are:
+			Format: { auto | always | never }
+			Default: auto
+
+			auto   - Hash pointers unless slab_debug is enabled.
+			always - Always hash pointers (even if slab_debug is
+				 enabled).
+			never  - Never hash pointers. This option should only
+				 be specified when debugging the kernel. Do
+				 not use on production kernels. The boot
+				 param "no_hash_pointers" is an alias for
+				 this mode.
+
 	hashdist=	[KNL,NUMA] Large hashes allocated during boot
 			are distributed across NUMA nodes.  Defaults on
 			for 64-bit NUMA, off otherwise.
@@ -4194,18 +4237,7 @@
 
 	no_hash_pointers
 			[KNL,EARLY]
-			Force pointers printed to the console or buffers to be
-			unhashed. By default, when a pointer is printed via %p
-			format string, that pointer is "hashed", i.e. obscured
-			by hashing the pointer value. This is a security feature
-			that hides actual kernel addresses from unprivileged
-			users, but it also makes debugging the kernel more
-			difficult since unequal pointers can no longer be
-			compared. However, if this command-line option is
-			specified, then all normal pointers will have their true
-			value printed. This option should only be specified when
-			debugging the kernel. Please do not use on production
-			kernels.
+			Alias for "hash_pointers=never".
 
 	nohibernate	[HIBERNATION] Disable hibernation and resume.
 
@@ -4557,7 +4589,7 @@
 			bit 2: print timer info
 			bit 3: print locks info if CONFIG_LOCKDEP is on
 			bit 4: print ftrace buffer
-			bit 5: print all printk messages in buffer
+			bit 5: replay all messages on consoles at the end of panic
 			bit 6: print all CPUs backtrace (if available in the arch)
 			bit 7: print only tasks in uninterruptible (blocked) state
 			*Be aware* that this option may print a _lot_ of lines,
@@ -4565,6 +4597,25 @@
 			Use this option carefully, maybe worth to setup a
 			bigger log buffer with "log_buf_len" along with this.
 
+	panic_sys_info=	A comma-separated list of extra information to be dumped
+			on panic.
+			Format: val[,val...]
+			Where @val can be any of the following:
+
+			tasks: print all tasks info
+			mem: print system memory info
+			timers: print timers info
+			locks: print locks info if CONFIG_LOCKDEP is on
+			ftrace: print ftrace buffer
+			all_bt: print all CPUs backtrace (if available in the arch)
+			blocked_tasks: print only tasks in uninterruptible (blocked) state
+
+			This is a human-readable alternative to the 'panic_print' option.
+
+	panic_console_replay
+			When panic happens, replay all kernel messages on
+			consoles at the end of panic.
+
 	parkbd.port=	[HW] Parallel port number the keyboard adapter is
 			connected to, default is 0.
 			Format: <parport#>
@@ -6603,6 +6654,10 @@
 			Documentation/admin-guide/mm/slab.rst.
 			(slub_debug legacy name also accepted for now)
 
+			Using this option implies the "no_hash_pointers"
+			option which can be undone by adding the
+			"hash_pointers=always" option.
+
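An illustrative boot command line combining the crashkernel CMA, pointer-hashing, and panic options documented above; the sizes and the value list are examples only, not recommendations:

	crashkernel=256M crashkernel=1G,cma hash_pointers=always panic_sys_info=tasks,mem,timers panic_console_replay

At runtime, the same comma-separated list can be written to the matching sysctl (documented below), mirroring the existing panic_print echo example:

	echo tasks,mem,timers > /proc/sys/kernel/panic_sys_info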
 	slab_max_order=	[MM]
 			Determines the maximum allowed order for slabs.
 			A high setting may cause OOMs due to memory
@@ -7032,6 +7087,11 @@
 			consumed by the stack hash table. By default
 			this is set to false.
 
+	stack_depot_max_pools= [KNL,EARLY]
+			Specify the maximum number of pools to use for storing
+			stack traces. Pools are allocated on-demand up to this
+			limit. Default value is 8191 pools.
+
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 3c8faad03d01..8b49eab937d0 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -890,7 +890,7 @@ bit 1 print system memory info
 bit 2 print timer info
 bit 3 print locks info if ``CONFIG_LOCKDEP`` is on
 bit 4 print ftrace buffer
-bit 5 print all printk messages in buffer
+bit 5 replay all messages on consoles at the end of panic
 bit 6 print all CPUs backtrace (if available in the arch)
 bit 7 print only tasks in uninterruptible (blocked) state
 ===== ============================================
@@ -900,6 +900,24 @@ So for example to print tasks and memory info on panic, user can::
 
   echo 3 > /proc/sys/kernel/panic_print
 
+panic_sys_info
+==============
+
+A comma-separated list of extra information to be dumped on panic,
+for example, "tasks,mem,timers,...". It is a human-readable alternative
+to 'panic_print'. Possible values are:
+
+============= ===================================================
+tasks         print all tasks info
+mem           print system memory info
+timers        print timers info
+locks         print locks info if CONFIG_LOCKDEP is on
+ftrace        print ftrace buffer
+all_bt        print all CPUs backtrace (if available in the arch)
+blocked_tasks print only tasks in uninterruptible (blocked) state
+============= ===================================================
+
+
 panic_on_rcu_stall
 ==================
 
diff --git a/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml b/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml
new file mode 100644
index 000000000000..fe2e9633c46f
--- /dev/null
+++ b/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i3c/renesas,i3c.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/G3S and RZ/G3E I3C Bus Interface
+
+maintainers:
+  - Wolfram Sang <wsa+renesas@sang-engineering.com>
+  - Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com>
+
+properties:
+  compatible:
+    items:
+      - enum:
+          - renesas,r9a08g045-i3c # RZ/G3S
+          - renesas,r9a09g047-i3c # RZ/G3E
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    items:
+      - description: Non-recoverable internal error interrupt
+      - description: Normal transfer error interrupt
+      - description: Normal transfer abort interrupt
+      - description: Normal response status buffer full interrupt
+      - description: Normal command buffer empty interrupt
+      - description: Normal IBI status buffer full interrupt
+      - description: Normal Rx data buffer full interrupt
+      - description: Normal Tx data buffer empty interrupt
+      - description: Normal receive status buffer full interrupt
+      - description: START condition detection interrupt
+      - description: STOP condition detection interrupt
+      - description: Transmit end interrupt
+      - description: NACK detection interrupt
+      - description: Arbitration lost interrupt
+      - description: Timeout detection interrupt
+      - description: Wake-up condition detection interrupt
+      - description: HDR Exit Pattern detection interrupt
+    minItems: 16
+
+  interrupt-names:
+    items:
+      - const: ierr
+      - const: terr
+      - const: abort
+      - const: resp
+      - const: cmd
+      - const: ibi
+      - const: rx
+      - const: tx
+      - const: rcv
+      - const: st
+      - const: sp
+      - const: tend
+      - const: nack
+      - const: al
+      - const: tmo
+      - const: wu
+      - const: exit
+    minItems: 16
+
+  clocks:
+    items:
+      - description: APB bus clock
+      - description: transfer clock
+      - description: SFRs clock
+    minItems: 2
+
+  clock-names:
+    items:
+      - const: pclk
+      - const: tclk
+      - const: pclkrw
+    minItems: 2
+
+  power-domains:
+    maxItems: 1
+
+  resets:
+    items:
+      - description: Reset signal
+      - description: APB interface reset signal/SCAN reset signal
+
+  reset-names:
+    items:
+      - const: presetn
+      - const: tresetn
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-names
+  - clock-names
+  - clocks
+  - power-domains
+  - resets
+  - reset-names
+
+allOf:
+  - $ref: i3c.yaml#
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: renesas,r9a08g045-i3c
+    then:
+      properties:
+        clocks:
+          maxItems: 2
+        clock-names:
+          maxItems: 2
+        interrupts:
+          minItems: 17
+        interrupt-names:
+          minItems: 17
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: renesas,r9a09g047-i3c
+    then:
+      properties:
+        clocks:
+          minItems: 3
+        clock-names:
+          minItems: 3
+        interrupts:
+          maxItems: 16
+        interrupt-names:
+          maxItems: 16
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/r9a08g045-cpg.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    i3c@1005b000 {
+        compatible = "renesas,r9a08g045-i3c";
+        reg = <0x1005b000 0x1000>;
+        clocks = <&cpg CPG_MOD R9A08G045_I3C_PCLK>,
+                 <&cpg CPG_MOD R9A08G045_I3C_TCLK>;
+        clock-names = "pclk", "tclk";
+        interrupts = <GIC_SPI 289 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 293 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 294 IRQ_TYPE_EDGE_RISING>,
+                     <GIC_SPI 295 IRQ_TYPE_EDGE_RISING>,
+                     <GIC_SPI 296 IRQ_TYPE_EDGE_RISING>,
+                     <GIC_SPI 297 IRQ_TYPE_EDGE_RISING>,
+                     <GIC_SPI 298 IRQ_TYPE_EDGE_RISING>,
+                     <GIC_SPI 299 IRQ_TYPE_EDGE_RISING>,
+                     <GIC_SPI 304 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 305 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 307 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 308 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 309 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 310 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 311 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 306 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "ierr", "terr", "abort", "resp",
+                          "cmd", "ibi", "rx", "tx", "rcv",
+                          "st", "sp", "tend", "nack",
+                          "al", "tmo", "wu", "exit";
+        resets = <&cpg R9A08G045_I3C_PRESETN>,
+                 <&cpg R9A08G045_I3C_TRESETN>;
+        reset-names = "presetn", "tresetn";
+        power-domains = <&cpg>;
+        #address-cells = <3>;
+        #size-cells = <0>;
+    };
+...
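A driver matching this binding would typically look its interrupts up by the names fixed above; a minimal probe sketch using the generic platform helpers (illustrative only, not taken from the actual renesas-i3c driver):

	#include <linux/interrupt.h>
	#include <linux/platform_device.h>

	static int demo_i3c_probe(struct platform_device *pdev)
	{
		/* "resp" is one of the interrupt-names the binding requires */
		int irq = platform_get_irq_byname(pdev, "resp");

		if (irq < 0)
			return irq;
		/* devm_request_irq(), clock and reset setup, etc. go here */
		return 0;
	}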
diff --git a/Documentation/devicetree/bindings/rtc/amlogic,a4-rtc.yaml b/Documentation/devicetree/bindings/rtc/amlogic,a4-rtc.yaml index 5d3ac737abcb..e61f22eca85b 100644 --- a/Documentation/devicetree/bindings/rtc/amlogic,a4-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/amlogic,a4-rtc.yaml @@ -16,9 +16,14 @@ allOf: properties: compatible: - enum: - - amlogic,a4-rtc - - amlogic,a5-rtc + oneOf: + - enum: + - amlogic,a4-rtc + - amlogic,a5-rtc + - items: + - enum: + - amlogic,c3-rtc + - const: amlogic,a5-rtc reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.yaml b/Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.yaml index e88b847a1cc5..e896ba59302a 100644 --- a/Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.yaml @@ -18,7 +18,12 @@ allOf: properties: compatible: - const: nxp,lpc1788-rtc + oneOf: + - items: + - enum: + - nxp,lpc1850-rtc + - const: nxp,lpc1788-rtc + - const: nxp,lpc1788-rtc reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/rtc/nxp,lpc3220-rtc.yaml b/Documentation/devicetree/bindings/rtc/nxp,lpc3220-rtc.yaml new file mode 100644 index 000000000000..53353de4cb37 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/nxp,lpc3220-rtc.yaml @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/nxp,lpc3220-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP LPC32xx SoC Real-time Clock + +maintainers: + - Frank Li <Frank.Li@nxp.com> + +properties: + compatible: + enum: + - nxp,lpc3220-rtc + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + interrupts: + maxItems: 1 + + start-year: true + +required: + - compatible + - reg + +allOf: + - $ref: rtc.yaml# + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/irq.h> + #include <dt-bindings/clock/lpc32xx-clock.h> + + rtc@40024000 { + compatible = "nxp,lpc3220-rtc"; + reg = <0x40024000 0x1000>; + interrupt-parent = <&sic1>; + interrupts = <20 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LPC32XX_CLK_RTC>; + }; + diff --git a/Documentation/devicetree/bindings/rtc/nxp,pcf85063.yaml b/Documentation/devicetree/bindings/rtc/nxp,pcf85063.yaml index 2f892f8640d1..1e6277e524c2 100644 --- a/Documentation/devicetree/bindings/rtc/nxp,pcf85063.yaml +++ b/Documentation/devicetree/bindings/rtc/nxp,pcf85063.yaml @@ -12,6 +12,7 @@ maintainers: properties: compatible: enum: + - microcrystal,rv8063 - microcrystal,rv8263 - nxp,pcf85063 - nxp,pcf85063a @@ -44,13 +45,19 @@ properties: wakeup-source: true + spi-cs-high: true + + spi-3wire: true + allOf: + - $ref: /schemas/spi/spi-peripheral-props.yaml# - $ref: rtc.yaml# - if: properties: compatible: contains: enum: + - microcrystal,rv8063 - microcrystal,rv8263 then: properties: @@ -65,12 +72,23 @@ allOf: properties: quartz-load-femtofarads: const: 7000 + - if: + properties: + compatible: + not: + contains: + enum: + - microcrystal,rv8063 + then: + properties: + spi-cs-high: false + spi-3wire: false required: - compatible - reg -additionalProperties: false +unevaluatedProperties: false examples: - | @@ -90,3 +108,16 @@ examples: }; }; }; + + - | + spi { + #address-cells = <1>; + #size-cells = <0>; + + rtc@0 { + compatible = "microcrystal,rv8063"; + reg = <0>; + spi-cs-high; + spi-3wire; + }; + }; diff --git a/Documentation/devicetree/bindings/soc/sophgo/sophgo,cv1800b-rtc.yaml b/Documentation/devicetree/bindings/rtc/sophgo,cv1800b-rtc.yaml 
index 5cf186c396c9..c695d2ff9fcc 100644 --- a/Documentation/devicetree/bindings/soc/sophgo/sophgo,cv1800b-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/sophgo,cv1800b-rtc.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/sophgo/sophgo,cv1800b-rtc.yaml# +$id: http://devicetree.org/schemas/rtc/sophgo,cv1800b-rtc.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Real Time Clock of the Sophgo CV1800 SoC diff --git a/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml index 7330a7200831..5e0c7cd25cc6 100644 --- a/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml @@ -63,8 +63,6 @@ properties: - microcrystal,rv3029 # Real Time Clock - microcrystal,rv8523 - # NXP LPC32xx SoC Real-time Clock - - nxp,lpc3220-rtc # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC - ricoh,r2025sd # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 8eeb7ea14f61..e5bb89452aff 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -238,9 +238,9 @@ usrjquota=<file> Appoint specified file and type during mount, so that quota grpjquota=<file> information can be properly updated during recovery flow, prjjquota=<file> <quota file>: must be in root directory; jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1]. -offusrjquota Turn off user journalled quota. -offgrpjquota Turn off group journalled quota. -offprjjquota Turn off project journalled quota. +usrjquota= Turn off user journalled quota. +grpjquota= Turn off group journalled quota. +prjjquota= Turn off project journalled quota. quota Enable plain user disk quota accounting. noquota Disable all plain disk quota option. 
alloc_mode=%s Adjust block allocation policy, which supports "reuse" diff --git a/MAINTAINERS b/MAINTAINERS index 7375f9ed8408..1b57dd4fcf01 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6713,7 +6713,7 @@ S: Supported F: drivers/input/keyboard/dlink-dir685-touchkeys.c DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK -M: Joshua Kinard <kumba@gentoo.org> +M: Joshua Kinard <linux@kumba.dev> S: Maintained F: drivers/rtc/rtc-ds1685.c F: include/linux/rtc/ds1685.h @@ -11612,6 +11612,13 @@ S: Maintained F: Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml F: drivers/i3c/master/i3c-master-cdns.c +I3C DRIVER FOR RENESAS +M: Wolfram Sang <wsa+renesas@sang-engineering.com> +M: Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com> +S: Supported +F: Documentation/devicetree/bindings/i3c/renesas,i3c.yaml +F: drivers/i3c/master/renesas-i3c.c + I3C DRIVER FOR SYNOPSYS DESIGNWARE S: Orphan F: Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml @@ -11622,6 +11629,7 @@ M: Alexandre Belloni <alexandre.belloni@bootlin.com> R: Frank Li <Frank.Li@nxp.com> L: linux-i3c@lists.infradead.org (moderated for non-subscribers) S: Maintained +Q: https://patchwork.kernel.org/project/linux-i3c/list/ C: irc://chat.freenode.net/linux-i3c T: git git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git F: Documentation/ABI/testing/sysfs-bus-i3c @@ -13536,6 +13544,7 @@ F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h F: kernel/kexec_handover.c +F: tools/testing/selftests/kho/ KEYS-ENCRYPTED M: Mimi Zohar <zohar@linux.ibm.com> @@ -19733,6 +19742,16 @@ S: Maintained F: include/linux/delayacct.h F: kernel/delayacct.c +TASK DELAY MONITORING TOOLS +M: Andrew Morton <akpm@linux-foundation.org> +M: Wang Yaxin <wang.yaxin@zte.com.cn> +M: Fan Yu <fan.yu9@zte.com.cn> +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/accounting/delay-accounting.rst +F: tools/accounting/delaytop.c +F: tools/accounting/getdelays.c + PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra <peterz@infradead.org> M: Ingo Molnar <mingo@redhat.com> @@ -22032,6 +22051,10 @@ K: \b(?i:rust)\b RUST [ALLOC] M: Danilo Krummrich <dakr@kernel.org> +R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> +R: Vlastimil Babka <vbabka@suse.cz> +R: Liam R. 
Howlett <Liam.Howlett@oracle.com> +R: Uladzislau Rezki <urezki@gmail.com> L: rust-for-linux@vger.kernel.org S: Maintained T: git https://github.com/Rust-for-Linux/linux.git alloc-next @@ -23370,6 +23393,7 @@ F: drivers/md/md* F: drivers/md/raid* F: include/linux/raid/ F: include/uapi/linux/raid/ +F: lib/raid6/ SOLIDRUN CLEARFOG SUPPORT M: Russell King <linux@armlinux.org.uk> @@ -479,11 +479,17 @@ export rust_common_flags := --edition=2021 \ -Wrust_2018_idioms \ -Wunreachable_pub \ -Wclippy::all \ + -Wclippy::as_ptr_cast_mut \ + -Wclippy::as_underscore \ + -Wclippy::cast_lossless \ -Wclippy::ignored_unit_patterns \ -Wclippy::mut_mut \ -Wclippy::needless_bitwise_bool \ -Aclippy::needless_lifetimes \ -Wclippy::no_mangle_with_rust_abi \ + -Wclippy::ptr_as_ptr \ + -Wclippy::ptr_cast_constness \ + -Wclippy::ref_as_ptr \ -Wclippy::undocumented_unsafe_blocks \ -Wclippy::unnecessary_safety_comment \ -Wclippy::unnecessary_safety_doc \ diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index b1bfbd11980d..d38f4d6759e4 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -17,6 +17,7 @@ #include <linux/vmalloc.h> #include <linux/mc146818rtc.h> #include <linux/rtc.h> +#include <linux/string.h> #include <linux/module.h> #include <linux/memblock.h> @@ -79,10 +80,12 @@ mk_resource_name(int pe, int port, char *str) { char tmp[80]; char *name; - - sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port); - name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES); - strcpy(name, tmp); + size_t sz; + + sz = scnprintf(tmp, sizeof(tmp), "PCI %s PE %d PORT %d", str, pe, port); + sz += 1; /* NUL terminator */ + name = memblock_alloc_or_panic(sz, SMP_CACHE_BYTES); + strscpy(name, tmp, sz); return name; } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index a41c93988d2c..0bfd66c7ada0 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void) total_mem = get_total_mem(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, - NULL, NULL); + NULL, NULL, NULL); /* invalid value specified or crashkernel=0 */ if (ret || !crash_size) return; diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h index 6e73809f6492..a5f13801b784 100644 --- a/arch/arm64/include/asm/asm-bug.h +++ b/arch/arm64/include/asm/asm-bug.h @@ -21,16 +21,21 @@ #endif #ifdef CONFIG_GENERIC_BUG - -#define __BUG_ENTRY(flags) \ +#define __BUG_ENTRY_START \ .pushsection __bug_table,"aw"; \ .align 2; \ 14470: .long 14471f - .; \ -_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - .short flags; \ + +#define __BUG_ENTRY_END \ .align 2; \ .popsection; \ 14471: + +#define __BUG_ENTRY(flags) \ + __BUG_ENTRY_START \ +_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ + .short flags; \ + __BUG_ENTRY_END #else #define __BUG_ENTRY(flags) #endif @@ -41,4 +46,24 @@ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ #define ASM_BUG() ASM_BUG_FLAGS(0) +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define __BUG_LOCATION_STRING(file, line) \ + ".long " file "- .;" \ + ".short " line ";" +#else +#define __BUG_LOCATION_STRING(file, line) +#endif + +#define __BUG_ENTRY_STRING(file, line, flags) \ + __stringify(__BUG_ENTRY_START) \ + __BUG_LOCATION_STRING(file, line) \ + ".short " flags ";" \ + __stringify(__BUG_ENTRY_END) + +#define ARCH_WARN_ASM(file, line, flags, size) \ + __BUG_ENTRY_STRING(file, line, flags) \ + __stringify(brk BUG_BRK_IMM) + +#define ARCH_WARN_REACHABLE + #endif /* __ASM_ASM_BUG_H */ 
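The string-producing macros above let C code emit the same bug_entry record from inline asm; a sketch of an expansion site, modeled on the riscv __BUG_FLAGS() and x86 _BUG_FLAGS() users visible later in this diff (DEMO_WARN is a hypothetical name, and the generic bug_entry/BUGFLAG_WARNING plumbing is assumed):

	/* DEMO_WARN is hypothetical; the operands mirror the riscv/x86 users. */
	#define DEMO_WARN()						\
		asm volatile (ARCH_WARN_ASM("%c0", "%c1", "%c2", "%c3")	\
			      : : "i" (__FILE__), "i" (__LINE__),	\
				  "i" (BUGFLAG_WARNING),		\
				  "i" (sizeof(struct bug_entry)))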
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0c8c35dd645e..ea84a61ed508 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index b99fbb388fe0..22b27cd447a1 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void) return; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base, &low_size, &high); + &crash_size, &crash_base, &low_size, NULL, &high); if (ret) return; diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index fbfe0771317e..11b9b6b63e19 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void) total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, - NULL, NULL); + NULL, NULL, NULL); if (ret != 0 || crash_size <= 0) return; diff --git a/arch/openrisc/include/asm/mmu.h b/arch/openrisc/include/asm/mmu.h index eb720110f3a2..e7826a681bc4 100644 --- a/arch/openrisc/include/asm/mmu.h +++ b/arch/openrisc/include/asm/mmu.h @@ -15,7 +15,7 @@ #ifndef __ASM_OPENRISC_MMU_H #define __ASM_OPENRISC_MMU_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef unsigned long mm_context_t; #endif diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index c589e96035e1..85797f94d1d7 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -25,7 +25,7 @@ */ #include <asm/setup.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) @@ -55,10 +55,10 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) }) #define __pgprot(x) ((pgprot_t) { (x) }) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET)) #define __pa(x) ((unsigned long) (x) - PAGE_OFFSET) @@ -73,7 +73,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) (pfn_valid(virt_to_pfn(kaddr))) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 5bd6463bd514..d33702831505 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -23,7 +23,7 @@ #include <asm-generic/pgtable-nopmd.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/mmu.h> #include <asm/fixmap.h> @@ -430,5 +430,5 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) typedef pte_t *pte_addr_t; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_OPENRISC_PGTABLE_H */ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index e05d1b59e24e..3ff893a67c13 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -39,7 +39,7 @@ */ #define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; @@ -78,5 +78,5 
@@ void show_registers(struct pt_regs *regs); #define cpu_relax() barrier() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_OPENRISC_PROCESSOR_H */ diff --git a/arch/openrisc/include/asm/ptrace.h b/arch/openrisc/include/asm/ptrace.h index e5a282b67075..28facf2f3e00 100644 --- a/arch/openrisc/include/asm/ptrace.h +++ b/arch/openrisc/include/asm/ptrace.h @@ -27,7 +27,7 @@ * they share a cacheline (not done yet, though... future optimization). */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This struct describes how the registers are laid out on the kernel stack * during a syscall or other kernel entry. @@ -147,7 +147,7 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, return *(unsigned long *)((unsigned long)regs + offset); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Offsets used by 'ptrace' system call interface. diff --git a/arch/openrisc/include/asm/setup.h b/arch/openrisc/include/asm/setup.h index 9acbc5deda69..dce9f4d3b378 100644 --- a/arch/openrisc/include/asm/setup.h +++ b/arch/openrisc/include/asm/setup.h @@ -8,7 +8,7 @@ #include <linux/init.h> #include <asm-generic/setup.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void __init or1k_early_setup(void *fdt); #endif diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 4af3049c34c2..e338fff7efb0 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -17,7 +17,7 @@ #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/types.h> #include <asm/processor.h> #endif @@ -38,7 +38,7 @@ * - if the contents of this structure are changed, the assembly constants * must also be changed */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct thread_info { struct task_struct *task; /* main task structure */ @@ -58,7 +58,7 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -75,7 +75,7 @@ register struct thread_info *current_thread_info_reg asm("r10"); #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * thread information flags diff --git a/arch/openrisc/include/uapi/asm/ptrace.h b/arch/openrisc/include/uapi/asm/ptrace.h index a77cc9915ca8..1f12a60d5a06 100644 --- a/arch/openrisc/include/uapi/asm/ptrace.h +++ b/arch/openrisc/include/uapi/asm/ptrace.h @@ -20,7 +20,7 @@ #ifndef _UAPI__ASM_OPENRISC_PTRACE_H #define _UAPI__ASM_OPENRISC_PTRACE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This is the layout of the regset returned by the GETREGSET ptrace call */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 4312bcb913a4..8053b24afc39 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -425,6 +425,7 @@ #define PPC_RAW_SC() (0x44000002) #define PPC_RAW_SYNC() (0x7c0004ac) #define PPC_RAW_ISYNC() (0x4c00012c) +#define PPC_RAW_LWSYNC() (0x7c2004ac) /* * Define what the VSX XX1 form instructions will look like, then add diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 13578f4db254..bb836f02101c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1139,6 +1139,7 @@ int eeh_unfreeze_pe(struct eeh_pe *pe) return ret; } +EXPORT_SYMBOL_GPL(eeh_unfreeze_pe); static struct pci_device_id eeh_reset_ids[] = { diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 10ce6b3bd3b7..48ad0116f359 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -257,13 +257,12 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, struct pci_driver *driver; enum pci_ers_result new_result; - pci_lock_rescan_remove(); pdev = edev->pdev; if (pdev) get_device(&pdev->dev); - pci_unlock_rescan_remove(); if (!pdev) { eeh_edev_info(edev, "no device"); + *result = PCI_ERS_RESULT_DISCONNECT; return; } device_lock(&pdev->dev); @@ -304,8 +303,9 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root, struct eeh_dev *edev, *tmp; pr_info("EEH: Beginning: '%s'\n", name); - eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) - eeh_pe_report_edev(edev, fn, result); + eeh_for_each_pe(root, pe) + eeh_pe_for_each_dev(pe, edev, tmp) + eeh_pe_report_edev(edev, fn, result); if (result) pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n", name, pci_ers_result_name(*result)); @@ -383,6 +383,8 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) if (!edev) return; + pci_lock_rescan_remove(); + /* * The content in the config space isn't saved because * the blocked config space on some adapters. 
We have @@ -393,14 +395,19 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); + pci_unlock_rescan_remove(); return; } pdev = eeh_dev_to_pci_dev(edev); - if (!pdev) + if (!pdev) { + pci_unlock_rescan_remove(); return; + } pci_restore_state(pdev); + + pci_unlock_rescan_remove(); } /** @@ -647,9 +654,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); } else { - pci_lock_rescan_remove(); pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); } /* @@ -665,8 +670,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (rc) return rc; - pci_lock_rescan_remove(); - /* Restore PE */ eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); @@ -674,7 +677,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, /* Clear frozen state */ rc = eeh_clear_pe_frozen_state(pe, false); if (rc) { - pci_unlock_rescan_remove(); return rc; } @@ -709,7 +711,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, pe->tstamp = tstamp; pe->freeze_count = cnt; - pci_unlock_rescan_remove(); return 0; } @@ -843,10 +844,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; int devices = 0; + pci_lock_rescan_remove(); + bus = eeh_pe_bus_get(pe); if (!bus) { pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", __func__, pe->phb->global_number, pe->addr); + pci_unlock_rescan_remove(); return; } @@ -1094,10 +1098,15 @@ recover_failed: eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); + bus = eeh_pe_bus_get(pe); + if (bus) + pci_hp_remove_devices(bus); + else + pr_err("%s: PCI bus for PHB#%x-PE#%x disappeared\n", + __func__, pe->phb->global_number, pe->addr); + /* The passed PE should no longer be used */ + pci_unlock_rescan_remove(); return; } @@ -1114,6 +1123,8 @@ out: eeh_clear_slot_attention(edev->pdev); eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); + + pci_unlock_rescan_remove(); } /** @@ -1132,6 +1143,7 @@ void eeh_handle_special_event(void) unsigned long flags; int rc; + pci_lock_rescan_remove(); do { rc = eeh_ops->next_error(&pe); @@ -1171,10 +1183,12 @@ void eeh_handle_special_event(void) break; case EEH_NEXT_ERR_NONE: + pci_unlock_rescan_remove(); return; default: pr_warn("%s: Invalid value %d from next_error()\n", __func__, rc); + pci_unlock_rescan_remove(); return; } @@ -1186,7 +1200,9 @@ void eeh_handle_special_event(void) if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + pci_unlock_rescan_remove(); eeh_handle_normal_event(pe); + pci_lock_rescan_remove(); } else { eeh_for_each_pe(pe, tmp_pe) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) @@ -1199,7 +1215,6 @@ void eeh_handle_special_event(void) eeh_report_failure, NULL); eeh_set_channel_state(pe, pci_channel_io_perm_failure); - pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { phb_pe = eeh_phb_pe_get(hose); if (!phb_pe || @@ -1218,7 +1233,6 @@ void eeh_handle_special_event(void) } pci_hp_remove_devices(bus); } - pci_unlock_rescan_remove(); } /* @@ -1228,4 +1242,6 @@ void eeh_handle_special_event(void) if (rc == EEH_NEXT_ERR_DEAD_IOC) break; } while (rc != EEH_NEXT_ERR_NONE); + + pci_unlock_rescan_remove(); } 
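Taken together, the eeh_driver.c hunks above hoist pci_lock_rescan_remove() out of the leaf helpers and into the top-level event handlers; condensed from the hunks (an illustrative simplification, not the verbatim kernel code):

	void eeh_handle_special_event(void)
	{
		struct eeh_pe *pe;
		int rc;

		pci_lock_rescan_remove();
		do {
			rc = eeh_ops->next_error(&pe);
			if (rc == EEH_NEXT_ERR_FROZEN_PE ||
			    rc == EEH_NEXT_ERR_FENCED_PHB) {
				eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
				/* the normal-event handler takes the lock itself */
				pci_unlock_rescan_remove();
				eeh_handle_normal_event(pe);
				pci_lock_rescan_remove();
			}
		} while (rc != EEH_NEXT_ERR_NONE);
		pci_unlock_rescan_remove();
	}

Dropping and retaking the lock around the recursion avoids self-deadlock, since eeh_handle_normal_event() now acquires the rescan lock on entry.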
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d283d281d28e..e740101fadf3 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -671,10 +671,12 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val); /* Check link */ - if (!edev->pdev->link_active_reporting) { - eeh_edev_dbg(edev, "No link reporting capability\n"); - msleep(1000); - return; + if (edev->pdev) { + if (!edev->pdev->link_active_reporting) { + eeh_edev_dbg(edev, "No link reporting capability\n"); + msleep(1000); + return; + } } /* Wait the link is up until timeout (5s) */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8a050f30e6d9..5782e743fd27 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void) * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &size, &base, NULL, NULL); + &size, &base, NULL, NULL, NULL); if (ret == 0 && size > 0) { unsigned long max_size; diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 9ea74973d78d..6f444d0822d8 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -141,6 +141,9 @@ void pci_hp_add_devices(struct pci_bus *bus) struct pci_controller *phb; struct device_node *dn = pci_bus_to_OF_node(bus); + if (!dn) + return; + phb = pci_bus_to_host(bus); mode = PCI_PROBE_NORMAL; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 00e9c267b912..d1a2d755381c 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void) /* use common parsing */ ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size, - &crash_base, NULL, NULL); + &crash_base, NULL, NULL, NULL); if (ret) return; diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 5c8d1bb98b3e..5e4897daaaea 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size) int ret; ret = parse_crashkernel(boot_command_line, size, &crash_size, - &crash_base, NULL, NULL); + &crash_base, NULL, NULL, NULL); if (ret != 0 || crash_size == 0) return; if (crash_base == 0) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index a25a6ffe7d7c..025524378443 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -409,6 +409,71 @@ asm ( " blr ;" ); +static int emit_atomic_ld_st(const struct bpf_insn insn, struct codegen_context *ctx, u32 *image) +{ + u32 code = insn.code; + u32 dst_reg = bpf_to_ppc(insn.dst_reg); + u32 src_reg = bpf_to_ppc(insn.src_reg); + u32 size = BPF_SIZE(code); + u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); + u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); + s16 off = insn.off; + s32 imm = insn.imm; + + switch (imm) { + case BPF_LOAD_ACQ: + switch (size) { + case BPF_B: + EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); + break; + case BPF_DW: + if (off % 4) { + EMIT(PPC_RAW_LI(tmp1_reg, off)); + EMIT(PPC_RAW_LDX(dst_reg, src_reg, tmp1_reg)); + } else { + EMIT(PPC_RAW_LD(dst_reg, src_reg, off)); + } + break; + } + EMIT(PPC_RAW_LWSYNC()); + break; + 
case BPF_STORE_REL: + EMIT(PPC_RAW_LWSYNC()); + switch (size) { + case BPF_B: + EMIT(PPC_RAW_STB(src_reg, dst_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_STH(src_reg, dst_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_STW(src_reg, dst_reg, off)); + break; + case BPF_DW: + if (off % 4) { + EMIT(PPC_RAW_LI(tmp2_reg, off)); + EMIT(PPC_RAW_STDX(src_reg, dst_reg, tmp2_reg)); + } else { + EMIT(PPC_RAW_STD(src_reg, dst_reg, off)); + } + break; + } + break; + default: + pr_err_ratelimited("unexpected atomic load/store op code %02x\n", + imm); + return -EINVAL; + } + + return 0; +} + /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct codegen_context *ctx, u32 *addrs, int pass, bool extra_pass) @@ -898,8 +963,25 @@ emit_clear: /* * BPF_STX ATOMIC (atomic ops) */ + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(&insn[i])) { + ret = emit_atomic_ld_st(insn[i], ctx, image); + if (ret) + return ret; + + if (size != BPF_DW && insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; + break; + } else if (size == BPF_B || size == BPF_H) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -EOPNOTSUPP; + } + save_reg = tmp2_reg; ret_reg = src_reg; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e5668d9de58b..a4b233a0659e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -93,6 +93,7 @@ config RISCV select CLINT_TIMER if RISCV_M_MODE select CLONE_BACKWARDS select COMMON_CLK + select CPU_NO_EFFICIENT_FFS if !RISCV_ISA_ZBB select CPU_PM if CPU_IDLE || HIBERNATION || SUSPEND select DYNAMIC_FTRACE if FUNCTION_TRACER select EDAC_SUPPORT diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index 1aaea81fb141..4c03e20ad11f 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -31,40 +31,45 @@ typedef u32 bug_insn_t; #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS #define __BUG_ENTRY_ADDR RISCV_INT " 1b - ." -#define __BUG_ENTRY_FILE RISCV_INT " %0 - ." +#define __BUG_ENTRY_FILE(file) RISCV_INT " " file " - ." 
#else #define __BUG_ENTRY_ADDR RISCV_PTR " 1b" -#define __BUG_ENTRY_FILE RISCV_PTR " %0" +#define __BUG_ENTRY_FILE(file) RISCV_PTR " " file #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __BUG_ENTRY \ +#define __BUG_ENTRY(file, line, flags) \ __BUG_ENTRY_ADDR "\n\t" \ - __BUG_ENTRY_FILE "\n\t" \ - RISCV_SHORT " %1\n\t" \ - RISCV_SHORT " %2" + __BUG_ENTRY_FILE(file) "\n\t" \ + RISCV_SHORT " " line "\n\t" \ + RISCV_SHORT " " flags #else -#define __BUG_ENTRY \ - __BUG_ENTRY_ADDR "\n\t" \ - RISCV_SHORT " %2" +#define __BUG_ENTRY(file, line, flags) \ + __BUG_ENTRY_ADDR "\n\t" \ + RISCV_SHORT " " flags #endif #ifdef CONFIG_GENERIC_BUG -#define __BUG_FLAGS(flags) \ -do { \ - __asm__ __volatile__ ( \ + +#define ARCH_WARN_ASM(file, line, flags, size) \ "1:\n\t" \ "ebreak\n" \ ".pushsection __bug_table,\"aw\"\n\t" \ "2:\n\t" \ - __BUG_ENTRY "\n\t" \ - ".org 2b + %3\n\t" \ + __BUG_ENTRY(file, line, flags) "\n\t" \ + ".org 2b + " size "\n\t" \ ".popsection" \ + +#define __BUG_FLAGS(flags) \ +do { \ + __asm__ __volatile__ ( \ + ARCH_WARN_ASM("%0", "%1", "%2", "%3") \ : \ : "i" (__FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) + #else /* CONFIG_GENERIC_BUG */ #define __BUG_FLAGS(flags) do { \ __asm__ __volatile__ ("ebreak\n"); \ @@ -78,6 +83,8 @@ do { \ #define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define ARCH_WARN_REACHABLE + #define HAVE_ARCH_BUG #include <asm-generic/bug.h> diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index f4755d49b89e..56444c7bd34e 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, kbuf.buf_align = PMD_SIZE; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.cma = NULL; kbuf.top_down = false; ret = arch_kexec_locate_mem_hole(&kbuf); if (!ret) { diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 14888e5ea19a..f90cce7a3ace 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -21,6 +21,8 @@ #include <linux/efi.h> #include <linux/crash_dump.h> #include <linux/panic_notifier.h> +#include <linux/jump_label.h> +#include <linux/gcd.h> #include <asm/acpi.h> #include <asm/alternative.h> @@ -362,6 +364,9 @@ void __init setup_arch(char **cmdline_p) riscv_user_isa_enable(); riscv_spinlock_init(); + + if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB) || !riscv_isa_extension_available(NULL, ZBB)) + static_branch_disable(&efficient_ffs_key); } bool arch_cpu_is_hotpluggable(int cpu) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8d0374d7ce8e..15683ae13fa5 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1408,7 +1408,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f244c5560e7f..b99aeb0db2ee 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void) int rc; rc = parse_crashkernel(boot_command_line, ident_map_size, - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 
8321b31d2e19..37073ca1e0ad 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -146,7 +146,7 @@ void __init reserve_crashkernel(void) return; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); if (ret == 0 && crash_size > 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index f0e9acf72547..20fcb8507ad1 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -32,45 +32,42 @@ #ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 -# define __BUG_REL(val) ".long " __stringify(val) +# define __BUG_REL(val) ".long " val #else -# define __BUG_REL(val) ".long " __stringify(val) " - ." +# define __BUG_REL(val) ".long " val " - ." #endif #ifdef CONFIG_DEBUG_BUGVERBOSE +#define __BUG_ENTRY(file, line, flags) \ + "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \ + "\t" __BUG_REL(file) "\t# bug_entry::file\n" \ + "\t.word " line "\t# bug_entry::line\n" \ + "\t.word " flags "\t# bug_entry::flags\n" +#else +#define __BUG_ENTRY(file, line, flags) \ + "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \ + "\t.word " flags "\t# bug_entry::flags\n" +#endif + +#define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra) \ + "1:\t" ins "\n" \ + ".pushsection __bug_table,\"aw\"\n" \ + __BUG_ENTRY(file, line, flags) \ + "\t.org 2b + " size "\n" \ + ".popsection\n" \ + extra #define _BUG_FLAGS(ins, flags, extra) \ do { \ - asm_inline volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ - "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ - "\t.word %c1" "\t# bug_entry::line\n" \ - "\t.word %c2" "\t# bug_entry::flags\n" \ - "\t.org 2b+%c3\n" \ - ".popsection\n" \ - extra \ + asm_inline volatile(_BUG_FLAGS_ASM(ins, "%c0", \ + "%c1", "%c2", "%c3", extra) \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) -#else /* !CONFIG_DEBUG_BUGVERBOSE */ - -#define _BUG_FLAGS(ins, flags, extra) \ -do { \ - asm_inline volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ - "\t.word %c0" "\t# bug_entry::flags\n" \ - "\t.org 2b+%c1\n" \ - ".popsection\n" \ - extra \ - : : "i" (flags), \ - "i" (sizeof(struct bug_entry))); \ -} while (0) - -#endif /* CONFIG_DEBUG_BUGVERBOSE */ +#define ARCH_WARN_ASM(file, line, flags, size) \ + _BUG_FLAGS_ASM(ASM_UD2, file, line, flags, size, "") #else @@ -92,11 +89,14 @@ do { \ * were to trigger, we'd rather wreck the machine in an attempt to get the * message out than not know about it. */ + +#define ARCH_WARN_REACHABLE ANNOTATE_REACHABLE(1b) + #define __WARN_FLAGS(flags) \ do { \ __auto_type __flags = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \ + _BUG_FLAGS(ASM_UD2, __flags, ARCH_WARN_REACHABLE); \ instrumentation_end(); \ } while (0) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index bcb534688dfe..c6b12bed173d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -163,10 +163,10 @@ static struct crash_mem *fill_up_crash_elf_data(void) return NULL; /* - * Exclusion of crash region and/or crashk_low_res may cause - * another range split. So add extra two slots here. + * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges + * may cause range splits. 
So add extra slots here. */ - nr_ranges += 2; + nr_ranges += 2 + crashk_cma_cnt; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; @@ -184,6 +184,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) static int elf_header_exclude_ranges(struct crash_mem *cmem) { int ret = 0; + int i; /* Exclude the low 1M because it is always reserved */ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1); @@ -198,8 +199,17 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) if (crashk_low_res.end) ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; - return ret; + for (i = 0; i < crashk_cma_cnt; ++i) { + ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + if (ret) + return ret; + } + + return 0; } static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) @@ -374,6 +384,14 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) add_e820_entry(params, &ei); } + for (i = 0; i < crashk_cma_cnt; ++i) { + ei.addr = crashk_cma_ranges[i].start; + ei.size = crashk_cma_ranges[i].end - + crashk_cma_ranges[i].start + 1; + ei.type = E820_TYPE_RAM; + add_e820_entry(params, &ei); + } + out: vfree(cmem); return ret; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0792f31961ac..1b2edd07a3e1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -603,7 +603,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) static void __init arch_reserve_crashkernel(void) { - unsigned long long crash_base, crash_size, low_size = 0; + unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0; bool high = false; int ret; @@ -612,7 +612,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, &cma_size, &high); if (ret) return; @@ -622,6 +622,7 @@ static void __init arch_reserve_crashkernel(void) } reserve_crashkernel_generic(crash_size, crash_base, low_size, high); + reserve_crashkernel_cma(cma_size); } static struct resource standard_io_resources[] = { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d1b79b418c05..850972deac8e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -641,7 +641,7 @@ static void kvm_pit_reset(struct kvm_pit *pit) kvm_pit_reset_reinject(pit); } -static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) +static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); @@ -763,7 +763,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - pit->mask_notifier.func = pit_mask_notifer; + pit->mask_notifier.func = pit_mask_notifier; kvm_pit_reset(pit); diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 5e2b2680d7db..9e4bb7fbde25 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -119,7 +119,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks, for (i = 0; i < disks; i++) { if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ - srcs[i] = (void*)raid6_empty_zero_page; + srcs[i] = raid6_get_zero_page(); } else { srcs[i] = page_address(blocks[i]) + offsets[i]; diff --git a/crypto/async_tx/async_raid6_recov.c 
b/crypto/async_tx/async_raid6_recov.c index 354b8cd5537f..539ea5b378dc 100644 --- a/crypto/async_tx/async_raid6_recov.c +++ b/crypto/async_tx/async_raid6_recov.c @@ -414,7 +414,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) if (blocks[i] == NULL) - ptrs[i] = (void *) raid6_empty_zero_page; + ptrs[i] = raid6_get_zero_page(); else ptrs[i] = page_address(blocks[i]) + offs[i]; @@ -497,7 +497,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila, async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) if (blocks[i] == NULL) - ptrs[i] = (void*)raid6_empty_zero_page; + ptrs[i] = raid6_get_zero_page(); else ptrs[i] = page_address(blocks[i]) + offs[i]; diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 9ad85fe6fd05..7e1fbf9a091f 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -9,7 +9,6 @@ use kernel::{ cpumask::CpumaskVar, device::{Core, Device}, error::code::*, - fmt, macros::vtable, module_platform_driver, of, opp, platform, prelude::*, @@ -19,7 +18,7 @@ use kernel::{ /// Finds exact supply name from the OF node. fn find_supply_name_exact(dev: &Device, name: &str) -> Option<CString> { - let prop_name = CString::try_from_fmt(fmt!("{}-supply", name)).ok()?; + let prop_name = CString::try_from_fmt(fmt!("{name}-supply")).ok()?; dev.fwnode()? .property_present(&prop_name) .then(|| CString::try_from_fmt(fmt!("{name}")).ok()) @@ -221,7 +220,7 @@ impl platform::Driver for CPUFreqDTDriver { module_platform_driver! { type: CPUFreqDTDriver, name: "cpufreq-dt", - author: "Viresh Kumar <viresh.kumar@linaro.org>", + authors: ["Viresh Kumar <viresh.kumar@linaro.org>"], description: "Generic CPUFreq DT driver", license: "GPL v2", } diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h index ace73424eeb6..ca272e8db6c7 100644 --- a/drivers/cxl/core/mce.h +++ b/drivers/cxl/core/mce.h @@ -7,7 +7,7 @@ #ifdef CONFIG_CXL_MCE int devm_cxl_register_mce_notifier(struct device *dev, - struct notifier_block *mce_notifer); + struct notifier_block *mce_notifier); #else static inline int devm_cxl_register_mce_notifier(struct device *dev, diff --git a/drivers/gpu/drm/drm_panic_qr.rs b/drivers/gpu/drm/drm_panic_qr.rs index 18492daae4b3..09a9b452e8b7 100644 --- a/drivers/gpu/drm/drm_panic_qr.rs +++ b/drivers/gpu/drm/drm_panic_qr.rs @@ -404,7 +404,7 @@ impl DecFifo { let mut out = 0; let mut exp = 1; for i in 0..poplen { - out += self.decimals[self.len + i] as u16 * exp; + out += u16::from(self.decimals[self.len + i]) * exp; exp *= 10; } Some((out, NUM_CHARS_BITS[poplen])) @@ -425,7 +425,7 @@ impl Iterator for SegmentIterator<'_> { match self.segment { Segment::Binary(data) => { if self.offset < data.len() { - let byte = data[self.offset] as u16; + let byte = u16::from(data[self.offset]); self.offset += 1; Some((byte, 8)) } else { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c index e8a04e476c57..09a64f224c49 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c @@ -220,8 +220,7 @@ static int guc_action_control_log(struct intel_guc *guc, bool enable, */ static int subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding) + void *prev_subbuf) { /* * Use no-overwrite mode by default, where relay will stop accepting diff --git a/drivers/gpu/drm/nova/nova.rs b/drivers/gpu/drm/nova/nova.rs index 
902876aa14d1..64fd670e99e1 100644 --- a/drivers/gpu/drm/nova/nova.rs +++ b/drivers/gpu/drm/nova/nova.rs @@ -12,7 +12,7 @@ use crate::driver::NovaDriver; kernel::module_auxiliary_driver! { type: NovaDriver, name: "Nova", - author: "Danilo Krummrich", + authors: ["Danilo Krummrich"], description: "Nova GPU driver", license: "GPL v2", } diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index bed6088e1bb3..8a07feef503b 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -266,7 +266,7 @@ struct xe_vm { * up for revalidation. Protected from access with the * @invalidated_lock. Removing items from the list * additionally requires @lock in write mode, and adding - * items to the list requires either the @userptr.notifer_lock in + * items to the list requires either the @userptr.notifier_lock in * write mode, OR @lock in write mode. */ struct list_head invalidated; diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs index 5749bad9c285..274989ea1fb4 100644 --- a/drivers/gpu/nova-core/driver.rs +++ b/drivers/gpu/nova-core/driver.rs @@ -19,7 +19,7 @@ kernel::pci_device_table!( MODULE_PCI_TABLE, <NovaCore as pci::Driver>::IdInfo, [( - pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_NVIDIA, bindings::PCI_ANY_ID as _), + pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_NVIDIA, bindings::PCI_ANY_ID as u32), () )] ); diff --git a/drivers/gpu/nova-core/firmware.rs b/drivers/gpu/nova-core/firmware.rs index 0fdece652587..2931912ddba0 100644 --- a/drivers/gpu/nova-core/firmware.rs +++ b/drivers/gpu/nova-core/firmware.rs @@ -30,11 +30,12 @@ pub(crate) struct Firmware { impl Firmware { pub(crate) fn new(dev: &device::Device, chipset: Chipset, ver: &str) -> Result<Firmware> { - let mut chip_name = CString::try_from_fmt(fmt!("{}", chipset))?; + let mut chip_name = CString::try_from_fmt(fmt!("{chipset}"))?; chip_name.make_ascii_lowercase(); + let chip_name = &*chip_name; let request = |name_| { - CString::try_from_fmt(fmt!("nvidia/{}/gsp/{}-{}.bin", &*chip_name, name_, ver)) + CString::try_from_fmt(fmt!("nvidia/{chip_name}/gsp/{name_}-{ver}.bin")) .and_then(|path| firmware::Firmware::request(&path, dev)) }; diff --git a/drivers/gpu/nova-core/nova_core.rs b/drivers/gpu/nova-core/nova_core.rs index de14f2e92636..cb2bbb30cba1 100644 --- a/drivers/gpu/nova-core/nova_core.rs +++ b/drivers/gpu/nova-core/nova_core.rs @@ -18,7 +18,7 @@ pub(crate) const MODULE_NAME: &kernel::str::CStr = <LocalModule as kernel::Modul kernel::module_pci_driver! { type: driver::NovaCore, name: "NovaCore", - author: "Danilo Krummrich", + authors: ["Danilo Krummrich"], description: "Nova Core GPU driver", license: "GPL v2", firmware: [], diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs index 5ccfb61f850a..d49fddf6a3c6 100644 --- a/drivers/gpu/nova-core/regs.rs +++ b/drivers/gpu/nova-core/regs.rs @@ -36,7 +36,7 @@ impl NV_PMC_BOOT_0 { pub(crate) fn chipset(self) -> Result<Chipset> { self.architecture() .map(|arch| { - ((arch as u32) << Self::IMPLEMENTATION.len()) | self.implementation() as u32 + ((arch as u32) << Self::IMPLEMENTATION.len()) | u32::from(self.implementation()) }) .and_then(Chipset::try_from) } diff --git a/drivers/gpu/nova-core/regs/macros.rs b/drivers/gpu/nova-core/regs/macros.rs index cdf668073480..a3e6de1779d4 100644 --- a/drivers/gpu/nova-core/regs/macros.rs +++ b/drivers/gpu/nova-core/regs/macros.rs @@ -307,7 +307,7 @@ macro_rules! 
register { pub(crate) fn [<set_ $field>](mut self, value: $to_type) -> Self { const MASK: u32 = $name::[<$field:upper _MASK>]; const SHIFT: u32 = $name::[<$field:upper _SHIFT>]; - let value = ((value as u32) << SHIFT) & MASK; + let value = (u32::from(value) << SHIFT) & MASK; self.0 = (self.0 & !MASK) | value; self diff --git a/drivers/gpu/nova-core/util.rs b/drivers/gpu/nova-core/util.rs index 64fb13760764..76cedf3710d7 100644 --- a/drivers/gpu/nova-core/util.rs +++ b/drivers/gpu/nova-core/util.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 use kernel::prelude::*; -use kernel::time::{Delta, Instant}; +use kernel::time::{Delta, Instant, Monotonic}; pub(crate) const fn to_lowercase_bytes<const N: usize>(s: &str) -> [u8; N] { let src = s.as_bytes(); @@ -33,7 +33,7 @@ pub(crate) const fn const_bytes_to_str(bytes: &[u8]) -> &str { /// TODO[DLAY]: replace with `read_poll_timeout` once it is available. /// (https://lore.kernel.org/lkml/20250220070611.214262-8-fujita.tomonori@gmail.com/) pub(crate) fn wait_on<R, F: Fn() -> Option<R>>(timeout: Delta, cond: F) -> Result<R> { - let start_time = Instant::now(); + let start_time = Instant::<Monotonic>::now(); loop { if let Some(ret) = cond() { diff --git a/drivers/i3c/device.c b/drivers/i3c/device.c index e80e48756914..2396545763ff 100644 --- a/drivers/i3c/device.c +++ b/drivers/i3c/device.c @@ -26,11 +26,12 @@ * * This function can sleep and thus cannot be called in atomic context. * - * Return: 0 in case of success, a negative error core otherwise. - * -EAGAIN: controller lost address arbitration. Target - * (IBI, HJ or controller role request) win the bus. Client - * driver needs to resend the 'xfers' some time later. - * See I3C spec ver 1.1.1 09-Jun-2021. Section: 5.1.2.2.3. + * Return: + * * 0 in case of success, a negative error code otherwise. + * * -EAGAIN: controller lost address arbitration. Target (IBI, HJ or + * controller role request) wins the bus. Client driver needs to resend the + * 'xfers' some time later. See I3C spec ver 1.1.1 09-Jun-2021. Section: + * 5.1.2.2.3.
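+ * An illustrative retry sketch (an assumption about client policy, not part of the API contract; real drivers should bound and pace their retries): ret = i3c_device_do_priv_xfers(dev, xfers, nxfers); if (ret == -EAGAIN) { usleep_range(100, 200); ret = i3c_device_do_priv_xfers(dev, xfers, nxfers); }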
*/ int i3c_device_do_priv_xfers(struct i3c_device *dev, struct i3c_priv_xfer *xfers, diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h index 433f6088b7ce..0d857cc68cc5 100644 --- a/drivers/i3c/internals.h +++ b/drivers/i3c/internals.h @@ -9,6 +9,7 @@ #define I3C_INTERNALS_H #include <linux/i3c/master.h> +#include <linux/io.h> void i3c_bus_normaluse_lock(struct i3c_bus *bus); void i3c_bus_normaluse_unlock(struct i3c_bus *bus); @@ -22,4 +23,41 @@ int i3c_dev_enable_ibi_locked(struct i3c_dev_desc *dev); int i3c_dev_request_ibi_locked(struct i3c_dev_desc *dev, const struct i3c_ibi_setup *req); void i3c_dev_free_ibi_locked(struct i3c_dev_desc *dev); + +/** + * i3c_writel_fifo - Write data buffer to 32bit FIFO + * @addr: FIFO Address to write to + * @buf: Pointer to the data bytes to write + * @nbytes: Number of bytes to write + */ +static inline void i3c_writel_fifo(void __iomem *addr, const void *buf, + int nbytes) +{ + writesl(addr, buf, nbytes / 4); + if (nbytes & 3) { + u32 tmp = 0; + + memcpy(&tmp, buf + (nbytes & ~3), nbytes & 3); + writel(tmp, addr); + } +} + +/** + * i3c_readl_fifo - Read data buffer from 32bit FIFO + * @addr: FIFO Address to read from + * @buf: Pointer to the buffer to store read bytes + * @nbytes: Number of bytes to read + */ +static inline void i3c_readl_fifo(const void __iomem *addr, void *buf, + int nbytes) +{ + readsl(addr, buf, nbytes / 4); + if (nbytes & 3) { + u32 tmp; + + tmp = readl(addr); + memcpy(buf + (nbytes & ~3), &tmp, nbytes & 3); + } +} + #endif /* I3C_INTERNAL_H */ diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index fd81871609d9..2ef898a8fd80 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -141,7 +141,7 @@ static ssize_t bcr_show(struct device *dev, i3c_bus_normaluse_lock(bus); desc = dev_to_i3cdesc(dev); - ret = sprintf(buf, "%x\n", desc->info.bcr); + ret = sprintf(buf, "0x%02x\n", desc->info.bcr); i3c_bus_normaluse_unlock(bus); return ret; @@ -158,7 +158,7 @@ static ssize_t dcr_show(struct device *dev, i3c_bus_normaluse_lock(bus); desc = dev_to_i3cdesc(dev); - ret = sprintf(buf, "%x\n", desc->info.dcr); + ret = sprintf(buf, "0x%02x\n", desc->info.dcr); i3c_bus_normaluse_unlock(bus); return ret; @@ -727,12 +727,12 @@ static int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode, switch (i3cbus->mode) { case I3C_BUS_MODE_PURE: if (!i3cbus->scl_rate.i3c) - i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE; + i3cbus->scl_rate.i3c = I3C_BUS_I3C_SCL_TYP_RATE; break; case I3C_BUS_MODE_MIXED_FAST: case I3C_BUS_MODE_MIXED_LIMITED: if (!i3cbus->scl_rate.i3c) - i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE; + i3cbus->scl_rate.i3c = I3C_BUS_I3C_SCL_TYP_RATE; if (!i3cbus->scl_rate.i2c) i3cbus->scl_rate.i2c = max_i2c_scl_rate; break; @@ -754,8 +754,8 @@ static int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode, * I3C/I2C frequency may have been overridden, check that user-provided * values are not exceeding max possible frequency. 
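* Overrides typically come from the i3c-scl-hz and i2c-scl-hz firmware properties.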
*/ - if (i3cbus->scl_rate.i3c > I3C_BUS_MAX_I3C_SCL_RATE || - i3cbus->scl_rate.i2c > I3C_BUS_I2C_FM_PLUS_SCL_RATE) + if (i3cbus->scl_rate.i3c > I3C_BUS_I3C_SCL_MAX_RATE || + i3cbus->scl_rate.i2c > I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE) return -EINVAL; return 0; @@ -837,14 +837,14 @@ static int i3c_master_send_ccc_cmd_locked(struct i3c_master_controller *master, return -EINVAL; if (!master->ops->send_ccc_cmd) - return -ENOTSUPP; + return -EOPNOTSUPP; if ((cmd->id & I3C_CCC_DIRECT) && (!cmd->dests || !cmd->ndests)) return -EINVAL; if (master->ops->supports_ccc_cmd && !master->ops->supports_ccc_cmd(master, cmd)) - return -ENOTSUPP; + return -EOPNOTSUPP; ret = master->ops->send_ccc_cmd(master, cmd); if (ret) { @@ -1439,7 +1439,7 @@ static int i3c_master_retrieve_dev_info(struct i3c_dev_desc *dev) if (dev->info.bcr & I3C_BCR_HDR_CAP) { ret = i3c_master_gethdrcap_locked(master, &dev->info); - if (ret) + if (ret && ret != -EOPNOTSUPP) return ret; } @@ -2210,7 +2210,7 @@ of_i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, */ if (boardinfo->base.flags & I2C_CLIENT_TEN) { dev_err(dev, "I2C device with 10 bit address not supported."); - return -ENOTSUPP; + return -EOPNOTSUPP; } /* LVR is encoded in reg[2]. */ @@ -2340,13 +2340,13 @@ static int i3c_master_i2c_adapter_xfer(struct i2c_adapter *adap, return -EINVAL; if (!master->ops->i2c_xfers) - return -ENOTSUPP; + return -EOPNOTSUPP; /* Doing transfers to different devices is not supported. */ addr = xfers[0].addr; for (i = 1; i < nxfers; i++) { if (addr != xfers[i].addr) - return -ENOTSUPP; + return -EOPNOTSUPP; } i3c_bus_normaluse_lock(&master->bus); @@ -2467,6 +2467,8 @@ static int i3c_i2c_notifier_call(struct notifier_block *nb, unsigned long action case BUS_NOTIFY_DEL_DEVICE: ret = i3c_master_i2c_detach(adap, client); break; + default: + ret = -EINVAL; } i3c_bus_maintenance_unlock(&master->bus); @@ -2766,7 +2768,7 @@ static int i3c_master_check_ops(const struct i3c_master_controller_ops *ops) * controller) * @ops: the master controller operations * @secondary: true if you are registering a secondary master. Will return - * -ENOTSUPP if set to true since secondary masters are not yet + * -EOPNOTSUPP if set to true since secondary masters are not yet * supported * * This function takes care of everything for you: @@ -2785,7 +2787,7 @@ int i3c_master_register(struct i3c_master_controller *master, const struct i3c_master_controller_ops *ops, bool secondary) { - unsigned long i2c_scl_rate = I3C_BUS_I2C_FM_PLUS_SCL_RATE; + unsigned long i2c_scl_rate = I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE; struct i3c_bus *i3cbus = i3c_master_get_bus(master); enum i3c_bus_mode mode = I3C_BUS_MODE_PURE; struct i2c_dev_boardinfo *i2cbi; @@ -2793,7 +2795,7 @@ int i3c_master_register(struct i3c_master_controller *master, /* We do not support secondary masters yet. 
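Mastership hand-over (the GETACCMST CCC) is not implemented by the framework.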
*/ if (secondary) - return -ENOTSUPP; + return -EOPNOTSUPP; ret = i3c_master_check_ops(ops); if (ret) @@ -2844,7 +2846,7 @@ int i3c_master_register(struct i3c_master_controller *master, } if (i2cbi->lvr & I3C_LVR_I2C_FM_MODE) - i2c_scl_rate = I3C_BUS_I2C_FM_SCL_RATE; + i2c_scl_rate = I3C_BUS_I2C_FM_SCL_MAX_RATE; } ret = i3c_bus_set_mode(i3cbus, mode, i2c_scl_rate); @@ -2954,7 +2956,7 @@ int i3c_dev_do_priv_xfers_locked(struct i3c_dev_desc *dev, return -EINVAL; if (!master->ops->priv_xfers) - return -ENOTSUPP; + return -EOPNOTSUPP; return master->ops->priv_xfers(dev, xfers, nxfers); } @@ -3004,7 +3006,7 @@ int i3c_dev_request_ibi_locked(struct i3c_dev_desc *dev, int ret; if (!master->ops->request_ibi) - return -ENOTSUPP; + return -EOPNOTSUPP; if (dev->ibi) return -EBUSY; diff --git a/drivers/i3c/master/Kconfig b/drivers/i3c/master/Kconfig index 7b30db3253af..13df2944f2ec 100644 --- a/drivers/i3c/master/Kconfig +++ b/drivers/i3c/master/Kconfig @@ -64,3 +64,13 @@ config MIPI_I3C_HCI_PCI This driver can also be built as a module. If so, the module will be called mipi-i3c-hci-pci. + +config RENESAS_I3C + tristate "Renesas I3C controller driver" + depends on HAS_IOMEM + depends on ARCH_RENESAS || COMPILE_TEST + help + Support the Renesas I3C controller as found in some RZ variants. + + This driver can also be built as a module. If so, the module will be + called renesas-i3c. diff --git a/drivers/i3c/master/Makefile b/drivers/i3c/master/Makefile index 3e97960160bc..aac74f3e3851 100644 --- a/drivers/i3c/master/Makefile +++ b/drivers/i3c/master/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_DW_I3C_MASTER) += dw-i3c-master.o obj-$(CONFIG_AST2600_I3C_MASTER) += ast2600-i3c-master.o obj-$(CONFIG_SVC_I3C_MASTER) += svc-i3c-master.o obj-$(CONFIG_MIPI_I3C_HCI) += mipi-i3c-hci/ +obj-$(CONFIG_RENESAS_I3C) += renesas-i3c.o diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 611c22b72c15..974122b2d20e 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -23,6 +23,7 @@ #include <linux/reset.h> #include <linux/slab.h> +#include "../internals.h" #include "dw-i3c-master.h" #define DEVICE_CTRL 0x0 @@ -336,37 +337,19 @@ static int dw_i3c_master_get_free_pos(struct dw_i3c_master *master) static void dw_i3c_master_wr_tx_fifo(struct dw_i3c_master *master, const u8 *bytes, int nbytes) { - writesl(master->regs + RX_TX_DATA_PORT, bytes, nbytes / 4); - if (nbytes & 3) { - u32 tmp = 0; - - memcpy(&tmp, bytes + (nbytes & ~3), nbytes & 3); - writesl(master->regs + RX_TX_DATA_PORT, &tmp, 1); - } -} - -static void dw_i3c_master_read_fifo(struct dw_i3c_master *master, - int reg, u8 *bytes, int nbytes) -{ - readsl(master->regs + reg, bytes, nbytes / 4); - if (nbytes & 3) { - u32 tmp; - - readsl(master->regs + reg, &tmp, 1); - memcpy(bytes + (nbytes & ~3), &tmp, nbytes & 3); - } + i3c_writel_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes); } static void dw_i3c_master_read_rx_fifo(struct dw_i3c_master *master, u8 *bytes, int nbytes) { - return dw_i3c_master_read_fifo(master, RX_TX_DATA_PORT, bytes, nbytes); + i3c_readl_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes); } static void dw_i3c_master_read_ibi_fifo(struct dw_i3c_master *master, u8 *bytes, int nbytes) { - return dw_i3c_master_read_fifo(master, IBI_QUEUE_STATUS, bytes, nbytes); + i3c_readl_fifo(master->regs + IBI_QUEUE_STATUS, bytes, nbytes); } static struct dw_i3c_xfer * @@ -622,14 +605,14 @@ static int dw_i2c_clk_cfg(struct dw_i3c_master *master) core_period = DIV_ROUND_UP(1000000000, 
core_rate); lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FMP_TLOW_MIN_NS, core_period); - hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_PLUS_SCL_RATE) - lcnt; + hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE) - lcnt; scl_timing = SCL_I2C_FMP_TIMING_HCNT(hcnt) | SCL_I2C_FMP_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I2C_FMP_TIMING); master->i2c_fmp_timing = scl_timing; lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FM_TLOW_MIN_NS, core_period); - hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_RATE) - lcnt; + hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_MAX_RATE) - lcnt; scl_timing = SCL_I2C_FM_TIMING_HCNT(hcnt) | SCL_I2C_FM_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I2C_FM_TIMING); @@ -699,7 +682,6 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) dw_i3c_master_enable(master); rpm_out: - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; } @@ -829,7 +811,6 @@ static int dw_i3c_master_send_ccc_cmd(struct i3c_master_controller *m, else ret = dw_i3c_ccc_set(master, ccc); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; } @@ -912,7 +893,6 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m) dw_i3c_master_free_xfer(xfer); rpm_out: - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; } @@ -932,7 +912,7 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, return 0; if (i3c_nxfers > master->caps.cmdfifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; for (i = 0; i < i3c_nxfers; i++) { if (i3c_xfers[i].rnw) @@ -943,7 +923,7 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, if (ntxwords > master->caps.datafifodepth || nrxwords > master->caps.datafifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; xfer = dw_i3c_master_alloc_xfer(master, i3c_nxfers); if (!xfer) @@ -998,7 +978,6 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, ret = xfer->ret; dw_i3c_master_free_xfer(xfer); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; } @@ -1093,7 +1072,7 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, return 0; if (i2c_nxfers > master->caps.cmdfifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; for (i = 0; i < i2c_nxfers; i++) { if (i2c_xfers[i].flags & I2C_M_RD) @@ -1104,7 +1083,7 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, if (ntxwords > master->caps.datafifodepth || nrxwords > master->caps.datafifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; xfer = dw_i3c_master_alloc_xfer(master, i2c_nxfers); if (!xfer) @@ -1142,13 +1121,12 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, } dw_i3c_master_enqueue_xfer(master, xfer); - if (!wait_for_completion_timeout(&xfer->comp, XFER_TIMEOUT)) + if (!wait_for_completion_timeout(&xfer->comp, m->i2c.timeout)) dw_i3c_master_dequeue_xfer(master, xfer); ret = xfer->ret; dw_i3c_master_free_xfer(xfer); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; } @@ -1316,7 +1294,6 @@ static int dw_i3c_master_disable_hotjoin(struct i3c_master_controller *m) writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_HOT_JOIN_NACK, master->regs + DEVICE_CTRL); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return 0; } @@ -1342,7 +1319,6 @@ static int dw_i3c_master_enable_ibi(struct i3c_dev_desc *dev) if (rc) { dw_i3c_master_set_sir_enabled(master, dev, data->index, false); - 
pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); } @@ -1362,7 +1338,6 @@ static int dw_i3c_master_disable_ibi(struct i3c_dev_desc *dev) dw_i3c_master_set_sir_enabled(master, dev, data->index, false); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return 0; } diff --git a/drivers/i3c/master/i3c-master-cdns.c b/drivers/i3c/master/i3c-master-cdns.c index fd3752cea654..97b151564d3d 100644 --- a/drivers/i3c/master/i3c-master-cdns.c +++ b/drivers/i3c/master/i3c-master-cdns.c @@ -23,6 +23,8 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> +#include "../internals.h" + #define DEV_ID 0x0 #define DEV_ID_I3C_MASTER 0x5034 @@ -412,7 +414,6 @@ struct cdns_i3c_master { } xferqueue; void __iomem *regs; struct clk *sysclk; - struct clk *pclk; struct cdns_i3c_master_caps caps; unsigned long i3c_scl_lim; const struct cdns_i3c_data *devdata; @@ -427,25 +428,13 @@ to_cdns_i3c_master(struct i3c_master_controller *master) static void cdns_i3c_master_wr_to_tx_fifo(struct cdns_i3c_master *master, const u8 *bytes, int nbytes) { - writesl(master->regs + TX_FIFO, bytes, nbytes / 4); - if (nbytes & 3) { - u32 tmp = 0; - - memcpy(&tmp, bytes + (nbytes & ~3), nbytes & 3); - writesl(master->regs + TX_FIFO, &tmp, 1); - } + i3c_writel_fifo(master->regs + TX_FIFO, bytes, nbytes); } static void cdns_i3c_master_rd_from_rx_fifo(struct cdns_i3c_master *master, u8 *bytes, int nbytes) { - readsl(master->regs + RX_FIFO, bytes, nbytes / 4); - if (nbytes & 3) { - u32 tmp; - - readsl(master->regs + RX_FIFO, &tmp, 1); - memcpy(bytes + (nbytes & ~3), &tmp, nbytes & 3); - } + i3c_readl_fifo(master->regs + RX_FIFO, bytes, nbytes); } static bool cdns_i3c_master_supports_ccc_cmd(struct i3c_master_controller *m, @@ -742,7 +731,7 @@ static int cdns_i3c_master_priv_xfers(struct i3c_dev_desc *dev, for (i = 0; i < nxfers; i++) { if (xfers[i].len > CMD0_FIFO_PL_LEN_MAX) - return -ENOTSUPP; + return -EOPNOTSUPP; } if (!nxfers) @@ -750,7 +739,7 @@ static int cdns_i3c_master_priv_xfers(struct i3c_dev_desc *dev, if (nxfers > master->caps.cmdfifodepth || nxfers > master->caps.cmdrfifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * First make sure that all transactions (block of transfers separated @@ -765,7 +754,7 @@ static int cdns_i3c_master_priv_xfers(struct i3c_dev_desc *dev, if (rxslots > master->caps.rxfifodepth || txslots > master->caps.txfifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; cdns_xfer = cdns_i3c_master_alloc_xfer(master, nxfers); if (!cdns_xfer) @@ -822,11 +811,11 @@ static int cdns_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, int i, ret = 0; if (nxfers > master->caps.cmdfifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; for (i = 0; i < nxfers; i++) { if (xfers[i].len > CMD0_FIFO_PL_LEN_MAX) - return -ENOTSUPP; + return -EOPNOTSUPP; if (xfers[i].flags & I2C_M_RD) nrxwords += DIV_ROUND_UP(xfers[i].len, 4); @@ -836,7 +825,7 @@ static int cdns_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, if (ntxwords > master->caps.txfifodepth || nrxwords > master->caps.rxfifodepth) - return -ENOTSUPP; + return -EOPNOTSUPP; xfer = cdns_i3c_master_alloc_xfer(master, nxfers); if (!xfer) @@ -863,7 +852,7 @@ static int cdns_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, } cdns_i3c_master_queue_xfer(master, xfer); - if (!wait_for_completion_timeout(&xfer->comp, msecs_to_jiffies(1000))) + if (!wait_for_completion_timeout(&xfer->comp, m->i2c.timeout)) cdns_i3c_master_unqueue_xfer(master, xfer); ret = xfer->ret; @@ -1330,12 +1319,7 @@ static void 
cdns_i3c_master_handle_ibi(struct cdns_i3c_master *master, buf = slot->data; nbytes = IBIR_XFER_BYTES(ibir); - readsl(master->regs + IBI_DATA_FIFO, buf, nbytes / 4); - if (nbytes % 3) { - u32 tmp = __raw_readl(master->regs + IBI_DATA_FIFO); - - memcpy(buf + (nbytes & ~3), &tmp, nbytes & 3); - } + i3c_readl_fifo(master->regs + IBI_DATA_FIFO, buf, nbytes); slot->len = min_t(unsigned int, IBIR_XFER_BYTES(ibir), dev->ibi->max_payload_len); @@ -1566,6 +1550,7 @@ MODULE_DEVICE_TABLE(of, cdns_i3c_master_of_ids); static int cdns_i3c_master_probe(struct platform_device *pdev) { struct cdns_i3c_master *master; + struct clk *pclk; int ret, irq; u32 val; @@ -1581,11 +1566,11 @@ static int cdns_i3c_master_probe(struct platform_device *pdev) if (IS_ERR(master->regs)) return PTR_ERR(master->regs); - master->pclk = devm_clk_get(&pdev->dev, "pclk"); - if (IS_ERR(master->pclk)) - return PTR_ERR(master->pclk); + pclk = devm_clk_get_enabled(&pdev->dev, "pclk"); + if (IS_ERR(pclk)) + return PTR_ERR(pclk); - master->sysclk = devm_clk_get(&pdev->dev, "sysclk"); + master->sysclk = devm_clk_get_enabled(&pdev->dev, "sysclk"); if (IS_ERR(master->sysclk)) return PTR_ERR(master->sysclk); @@ -1593,18 +1578,8 @@ static int cdns_i3c_master_probe(struct platform_device *pdev) if (irq < 0) return irq; - ret = clk_prepare_enable(master->pclk); - if (ret) - return ret; - - ret = clk_prepare_enable(master->sysclk); - if (ret) - goto err_disable_pclk; - - if (readl(master->regs + DEV_ID) != DEV_ID_I3C_MASTER) { - ret = -EINVAL; - goto err_disable_sysclk; - } + if (readl(master->regs + DEV_ID) != DEV_ID_I3C_MASTER) + return -EINVAL; spin_lock_init(&master->xferqueue.lock); INIT_LIST_HEAD(&master->xferqueue.list); @@ -1615,7 +1590,7 @@ static int cdns_i3c_master_probe(struct platform_device *pdev) ret = devm_request_irq(&pdev->dev, irq, cdns_i3c_master_interrupt, 0, dev_name(&pdev->dev), master); if (ret) - goto err_disable_sysclk; + return ret; platform_set_drvdata(pdev, master); @@ -1637,29 +1612,15 @@ static int cdns_i3c_master_probe(struct platform_device *pdev) master->ibi.slots = devm_kcalloc(&pdev->dev, master->ibi.num_slots, sizeof(*master->ibi.slots), GFP_KERNEL); - if (!master->ibi.slots) { - ret = -ENOMEM; - goto err_disable_sysclk; - } + if (!master->ibi.slots) + return -ENOMEM; writel(IBIR_THR(1), master->regs + CMD_IBI_THR_CTRL); writel(MST_INT_IBIR_THR, master->regs + MST_IER); writel(DEVS_CTRL_DEV_CLR_ALL, master->regs + DEVS_CTRL); - ret = i3c_master_register(&master->base, &pdev->dev, - &cdns_i3c_master_ops, false); - if (ret) - goto err_disable_sysclk; - - return 0; - -err_disable_sysclk: - clk_disable_unprepare(master->sysclk); - -err_disable_pclk: - clk_disable_unprepare(master->pclk); - - return ret; + return i3c_master_register(&master->base, &pdev->dev, + &cdns_i3c_master_ops, false); } static void cdns_i3c_master_remove(struct platform_device *pdev) @@ -1668,9 +1629,6 @@ static void cdns_i3c_master_remove(struct platform_device *pdev) cancel_work_sync(&master->hj_work); i3c_master_unregister(&master->base); - - clk_disable_unprepare(master->sysclk); - clk_disable_unprepare(master->pclk); } static struct platform_driver cdns_i3c_master = { diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index bc4538694540..60f1175f1f37 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -395,7 +395,7 @@ static int i3c_hci_i2c_xfers(struct i2c_dev_desc *dev, ret = hci->io->queue_xfer(hci, xfer, nxfers); if (ret) goto out; - if 
(!wait_for_completion_timeout(&done, HZ) && + if (!wait_for_completion_timeout(&done, m->i2c.timeout) && hci->io->dequeue_xfer(hci, xfer, nxfers)) { ret = -ETIME; goto out; } diff --git a/drivers/i3c/master/renesas-i3c.c b/drivers/i3c/master/renesas-i3c.c new file mode 100644 index 000000000000..174d3dc5d276 --- /dev/null +++ b/drivers/i3c/master/renesas-i3c.c @@ -0,0 +1,1404 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas I3C Controller driver + * Copyright (C) 2023-25 Renesas Electronics Corp. + * + * TODO: IBI support, HotJoin support, Target support + */ + +#include <linux/bitfield.h> +#include <linux/bitops.h> +#include <linux/clk.h> +#include <linux/completion.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/i2c.h> +#include <linux/i3c/master.h> +#include <linux/interrupt.h> +#include <linux/ioport.h> +#include <linux/iopoll.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/reset.h> +#include <linux/slab.h> +#include "../internals.h" + +#define PRTS 0x00 +#define PRTS_PRTMD BIT(0) + +#define BCTL 0x14 +#define BCTL_INCBA BIT(0) +#define BCTL_HJACKCTL BIT(8) +#define BCTL_ABT BIT(29) +#define BCTL_BUSE BIT(31) + +#define MSDVAD 0x18 +#define MSDVAD_MDYAD(x) FIELD_PREP(GENMASK(21, 16), x) +#define MSDVAD_MDYADV BIT(31) + +#define RSTCTL 0x20 +#define RSTCTL_RI3CRST BIT(0) +#define RSTCTL_INTLRST BIT(16) + +#define INST 0x30 + +#define IBINCTL 0x58 +#define IBINCTL_NRHJCTL BIT(0) +#define IBINCTL_NRMRCTL BIT(1) +#define IBINCTL_NRSIRCTL BIT(3) + +#define SVCTL 0x64 + +#define REFCKCTL 0x70 +#define REFCKCTL_IREFCKS(x) FIELD_PREP(GENMASK(2, 0), x) + +#define STDBR 0x74 +#define STDBR_SBRLO(cond, x) FIELD_PREP(GENMASK(7, 0), (x) >> (cond)) +#define STDBR_SBRHO(cond, x) FIELD_PREP(GENMASK(15, 8), (x) >> (cond)) +#define STDBR_SBRLP(x) FIELD_PREP(GENMASK(21, 16), x) +#define STDBR_SBRHP(x) FIELD_PREP(GENMASK(29, 24), x) +#define STDBR_DSBRPO BIT(31) + +#define EXTBR 0x78 +#define EXTBR_EBRLO(x) FIELD_PREP(GENMASK(7, 0), x) +#define EXTBR_EBRHO(x) FIELD_PREP(GENMASK(15, 8), x) +#define EXTBR_EBRLP(x) FIELD_PREP(GENMASK(21, 16), x) +#define EXTBR_EBRHP(x) FIELD_PREP(GENMASK(29, 24), x) + +#define BFRECDT 0x7c +#define BFRECDT_FRECYC(x) FIELD_PREP(GENMASK(8, 0), x) + +#define BAVLCDT 0x80 +#define BAVLCDT_AVLCYC(x) FIELD_PREP(GENMASK(8, 0), x) + +#define BIDLCDT 0x84 +#define BIDLCDT_IDLCYC(x) FIELD_PREP(GENMASK(17, 0), x) + +#define ACKCTL 0xa0 +#define ACKCTL_ACKT BIT(1) +#define ACKCTL_ACKTWP BIT(2) + +#define SCSTRCTL 0xa4 +#define SCSTRCTL_ACKTWE BIT(0) +#define SCSTRCTL_RWE BIT(1) + +#define SCSTLCTL 0xb0 + +#define CNDCTL 0x140 +#define CNDCTL_STCND BIT(0) +#define CNDCTL_SRCND BIT(1) +#define CNDCTL_SPCND BIT(2) + +#define NCMDQP 0x150 /* Normal Command Queue */ +#define NCMDQP_CMD_ATTR(x) FIELD_PREP(GENMASK(2, 0), x) +#define NCMDQP_IMMED_XFER 0x01 +#define NCMDQP_ADDR_ASSGN 0x02 +#define NCMDQP_TID(x) FIELD_PREP(GENMASK(6, 3), x) +#define NCMDQP_CMD(x) FIELD_PREP(GENMASK(14, 7), x) +#define NCMDQP_CP BIT(15) +#define NCMDQP_DEV_INDEX(x) FIELD_PREP(GENMASK(20, 16), x) +#define NCMDQP_BYTE_CNT(x) FIELD_PREP(GENMASK(25, 23), x) +#define NCMDQP_DEV_COUNT(x) FIELD_PREP(GENMASK(29, 26), x) +#define NCMDQP_MODE(x) FIELD_PREP(GENMASK(28, 26), x) +#define NCMDQP_RNW(x) FIELD_PREP(GENMASK(29, 29), x) +#define NCMDQP_ROC BIT(30) +#define NCMDQP_TOC BIT(31) +#define NCMDQP_DATA_LENGTH(x) FIELD_PREP(GENMASK(31, 16), x) + +#define NRSPQP 0x154 /* Normal Response Queue */ +#define
NRSPQP_NO_ERROR 0 +#define NRSPQP_ERROR_CRC 1 +#define NRSPQP_ERROR_PARITY 2 +#define NRSPQP_ERROR_FRAME 3 +#define NRSPQP_ERROR_IBA_NACK 4 +#define NRSPQP_ERROR_ADDRESS_NACK 5 +#define NRSPQP_ERROR_OVER_UNDER_FLOW 6 +#define NRSPQP_ERROR_TRANSF_ABORT 8 +#define NRSPQP_ERROR_I2C_W_NACK_ERR 9 +#define NRSPQP_ERROR_UNSUPPORTED 10 +#define NRSPQP_DATA_LEN(x) FIELD_GET(GENMASK(15, 0), x) +#define NRSPQP_ERR_STATUS(x) FIELD_GET(GENMASK(31, 28), x) + +#define NTDTBP0 0x158 /* Normal Transfer Data Buffer */ +#define NTDTBP0_DEPTH 16 + +#define NQTHCTL 0x190 +#define NQTHCTL_CMDQTH(x) FIELD_PREP(GENMASK(1, 0), x) +#define NQTHCTL_IBIDSSZ(x) FIELD_PREP(GENMASK(23, 16), x) + +#define NTBTHCTL0 0x194 + +#define NRQTHCTL 0x1c0 + +#define BST 0x1d0 +#define BST_STCNDDF BIT(0) +#define BST_SPCNDDF BIT(1) +#define BST_NACKDF BIT(4) +#define BST_TENDF BIT(8) + +#define BSTE 0x1d4 +#define BSTE_STCNDDE BIT(0) +#define BSTE_SPCNDDE BIT(1) +#define BSTE_NACKDE BIT(4) +#define BSTE_TENDE BIT(8) +#define BSTE_ALE BIT(16) +#define BSTE_TODE BIT(20) +#define BSTE_WUCNDDE BIT(24) +#define BSTE_ALL_FLAG (BSTE_STCNDDE | BSTE_SPCNDDE |\ + BSTE_NACKDE | BSTE_TENDE |\ + BSTE_ALE | BSTE_TODE | BSTE_WUCNDDE) + +#define BIE 0x1d8 +#define BIE_STCNDDIE BIT(0) +#define BIE_SPCNDDIE BIT(1) +#define BIE_NACKDIE BIT(4) +#define BIE_TENDIE BIT(8) + +#define NTST 0x1e0 +#define NTST_TDBEF0 BIT(0) +#define NTST_RDBFF0 BIT(1) +#define NTST_CMDQEF BIT(3) +#define NTST_RSPQFF BIT(4) +#define NTST_TABTF BIT(5) +#define NTST_TEF BIT(9) + +#define NTSTE 0x1e4 +#define NTSTE_TDBEE0 BIT(0) +#define NTSTE_RDBFE0 BIT(1) +#define NTSTE_IBIQEFE BIT(2) +#define NTSTE_CMDQEE BIT(3) +#define NTSTE_RSPQFE BIT(4) +#define NTSTE_TABTE BIT(5) +#define NTSTE_TEE BIT(9) +#define NTSTE_RSQFE BIT(20) +#define NTSTE_ALL_FLAG (NTSTE_TDBEE0 | NTSTE_RDBFE0 |\ + NTSTE_IBIQEFE | NTSTE_CMDQEE |\ + NTSTE_RSPQFE | NTSTE_TABTE |\ + NTSTE_TEE | NTSTE_RSQFE) + +#define NTIE 0x1e8 +#define NTIE_TDBEIE0 BIT(0) +#define NTIE_RDBFIE0 BIT(1) +#define NTIE_IBIQEFIE BIT(2) +#define NTIE_RSPQFIE BIT(4) +#define NTIE_RSQFIE BIT(20) + +#define BCST 0x210 +#define BCST_BFREF BIT(0) + +#define DATBAS(x) (0x224 + 0x8 * (x)) +#define DATBAS_DVSTAD(x) FIELD_PREP(GENMASK(6, 0), x) +#define DATBAS_DVDYAD(x) FIELD_PREP(GENMASK(23, 16), x) + +#define NDBSTLV0 0x398 +#define NDBSTLV0_RDBLV(x) FIELD_GET(GENMASK(15, 8), x) + +#define RENESAS_I3C_MAX_DEVS 8 +#define I2C_INIT_MSG -1 + +enum i3c_internal_state { + I3C_INTERNAL_STATE_DISABLED, + I3C_INTERNAL_STATE_CONTROLLER_IDLE, + I3C_INTERNAL_STATE_CONTROLLER_ENTDAA, + I3C_INTERNAL_STATE_CONTROLLER_SETDASA, + I3C_INTERNAL_STATE_CONTROLLER_WRITE, + I3C_INTERNAL_STATE_CONTROLLER_READ, + I3C_INTERNAL_STATE_CONTROLLER_COMMAND_WRITE, + I3C_INTERNAL_STATE_CONTROLLER_COMMAND_READ, +}; + +enum renesas_i3c_event { + I3C_COMMAND_ADDRESS_ASSIGNMENT, + I3C_WRITE, + I3C_READ, + I3C_COMMAND_WRITE, + I3C_COMMAND_READ, +}; + +struct renesas_i3c_cmd { + u32 cmd0; + u32 len; + const void *tx_buf; + u32 tx_count; + void *rx_buf; + u32 rx_count; + u32 err; + u8 rnw; + /* i2c xfer */ + int i2c_bytes_left; + int i2c_is_last; + u8 *i2c_buf; + const struct i2c_msg *msg; +}; + +struct renesas_i3c_xfer { + struct list_head node; + struct completion comp; + int ret; + bool is_i2c_xfer; + unsigned int ncmds; + struct renesas_i3c_cmd cmds[] __counted_by(ncmds); +}; + +struct renesas_i3c_xferqueue { + struct list_head list; + struct renesas_i3c_xfer *cur; + /* Lock for accessing the xfer queue */ + spinlock_t lock; +}; + +struct renesas_i3c { + struct 
i3c_master_controller base; + enum i3c_internal_state internal_state; + u16 maxdevs; + u32 free_pos; + u32 i2c_STDBR; + u32 i3c_STDBR; + u8 addrs[RENESAS_I3C_MAX_DEVS]; + struct renesas_i3c_xferqueue xferqueue; + void __iomem *regs; + struct clk *tclk; +}; + +struct renesas_i3c_i2c_dev_data { + u8 index; +}; + +struct renesas_i3c_irq_desc { + const char *name; + irq_handler_t isr; + const char *desc; +}; + +struct renesas_i3c_config { + unsigned int has_pclkrw:1; +}; + +static inline void renesas_i3c_reg_update(void __iomem *reg, u32 mask, u32 val) +{ + u32 data = readl(reg); + + data &= ~mask; + data |= (val & mask); + writel(data, reg); +} + +static inline u32 renesas_readl(void __iomem *base, u32 reg) +{ + return readl(base + reg); +} + +static inline void renesas_writel(void __iomem *base, u32 reg, u32 val) +{ + writel(val, base + reg); +} + +static void renesas_set_bit(void __iomem *base, u32 reg, u32 val) +{ + renesas_i3c_reg_update(base + reg, val, val); +} + +static void renesas_clear_bit(void __iomem *base, u32 reg, u32 val) +{ + renesas_i3c_reg_update(base + reg, val, 0); +} + +static inline struct renesas_i3c *to_renesas_i3c(struct i3c_master_controller *m) +{ + return container_of(m, struct renesas_i3c, base); +} + +static inline u32 datbas_dvdyad_with_parity(u8 addr) +{ + return DATBAS_DVDYAD(addr | (parity8(addr) ? 0 : BIT(7))); +} + +static int renesas_i3c_get_free_pos(struct renesas_i3c *i3c) +{ + if (!(i3c->free_pos & GENMASK(i3c->maxdevs - 1, 0))) + return -ENOSPC; + + return ffs(i3c->free_pos) - 1; +} + +static int renesas_i3c_get_addr_pos(struct renesas_i3c *i3c, u8 addr) +{ + int pos; + + for (pos = 0; pos < i3c->maxdevs; pos++) { + if (addr == i3c->addrs[pos]) + return pos; + } + + return -EINVAL; +} + +static struct renesas_i3c_xfer *renesas_i3c_alloc_xfer(struct renesas_i3c *i3c, + unsigned int ncmds) +{ + struct renesas_i3c_xfer *xfer; + + xfer = kzalloc(struct_size(xfer, cmds, ncmds), GFP_KERNEL); + if (!xfer) + return NULL; + + INIT_LIST_HEAD(&xfer->node); + xfer->ncmds = ncmds; + xfer->ret = -ETIMEDOUT; + + return xfer; +} + +static void renesas_i3c_start_xfer_locked(struct renesas_i3c *i3c) +{ + struct renesas_i3c_xfer *xfer = i3c->xferqueue.cur; + struct renesas_i3c_cmd *cmd; + u32 cmd1; + + if (!xfer) + return; + + cmd = xfer->cmds; + + switch (i3c->internal_state) { + case I3C_INTERNAL_STATE_CONTROLLER_ENTDAA: + case I3C_INTERNAL_STATE_CONTROLLER_SETDASA: + renesas_set_bit(i3c->regs, NTIE, NTIE_RSPQFIE); + renesas_writel(i3c->regs, NCMDQP, cmd->cmd0); + renesas_writel(i3c->regs, NCMDQP, 0); + break; + case I3C_INTERNAL_STATE_CONTROLLER_WRITE: + case I3C_INTERNAL_STATE_CONTROLLER_COMMAND_WRITE: + renesas_set_bit(i3c->regs, NTIE, NTIE_RSPQFIE); + if (cmd->len <= 4) { + cmd->cmd0 |= NCMDQP_CMD_ATTR(NCMDQP_IMMED_XFER); + cmd->cmd0 |= NCMDQP_BYTE_CNT(cmd->len); + cmd->tx_count = cmd->len; + cmd1 = cmd->len == 0 ? 
0 : *(u32 *)cmd->tx_buf; + } else { + cmd1 = NCMDQP_DATA_LENGTH(cmd->len); + } + renesas_writel(i3c->regs, NCMDQP, cmd->cmd0); + renesas_writel(i3c->regs, NCMDQP, cmd1); + break; + case I3C_INTERNAL_STATE_CONTROLLER_READ: + case I3C_INTERNAL_STATE_CONTROLLER_COMMAND_READ: + renesas_set_bit(i3c->regs, NTIE, NTIE_RDBFIE0); + cmd1 = NCMDQP_DATA_LENGTH(cmd->len); + renesas_writel(i3c->regs, NCMDQP, cmd->cmd0); + renesas_writel(i3c->regs, NCMDQP, cmd1); + break; + default: + break; + } + + /* Clear the command queue empty flag */ + renesas_clear_bit(i3c->regs, NTST, NTST_CMDQEF); +} + +static void renesas_i3c_dequeue_xfer_locked(struct renesas_i3c *i3c, + struct renesas_i3c_xfer *xfer) +{ + if (i3c->xferqueue.cur == xfer) + i3c->xferqueue.cur = NULL; + else + list_del_init(&xfer->node); +} + +static void renesas_i3c_dequeue_xfer(struct renesas_i3c *i3c, struct renesas_i3c_xfer *xfer) +{ + scoped_guard(spinlock_irqsave, &i3c->xferqueue.lock) + renesas_i3c_dequeue_xfer_locked(i3c, xfer); +} + +static void renesas_i3c_enqueue_xfer(struct renesas_i3c *i3c, struct renesas_i3c_xfer *xfer) +{ + reinit_completion(&xfer->comp); + scoped_guard(spinlock_irqsave, &i3c->xferqueue.lock) { + if (i3c->xferqueue.cur) { + list_add_tail(&xfer->node, &i3c->xferqueue.list); + } else { + i3c->xferqueue.cur = xfer; + if (!xfer->is_i2c_xfer) + renesas_i3c_start_xfer_locked(i3c); + } + } +} + +static void renesas_i3c_wait_xfer(struct renesas_i3c *i3c, struct renesas_i3c_xfer *xfer) +{ + unsigned long time_left; + + renesas_i3c_enqueue_xfer(i3c, xfer); + + time_left = wait_for_completion_timeout(&xfer->comp, msecs_to_jiffies(1000)); + if (!time_left) + renesas_i3c_dequeue_xfer(i3c, xfer); +} + +static void renesas_i3c_set_prts(struct renesas_i3c *i3c, u32 val) +{ + /* Required sequence according to tnrza0140ae */ + renesas_set_bit(i3c->regs, RSTCTL, RSTCTL_INTLRST); + renesas_writel(i3c->regs, PRTS, val); + renesas_clear_bit(i3c->regs, RSTCTL, RSTCTL_INTLRST); +} + +static void renesas_i3c_bus_enable(struct i3c_master_controller *m, bool i3c_mode) +{ + struct renesas_i3c *i3c = to_renesas_i3c(m); + + /* Setup either I3C or I2C protocol */ + if (i3c_mode) { + renesas_i3c_set_prts(i3c, 0); + /* Revisit: INCBA handling, especially after I2C transfers */ + renesas_set_bit(i3c->regs, BCTL, BCTL_HJACKCTL | BCTL_INCBA); + renesas_set_bit(i3c->regs, MSDVAD, MSDVAD_MDYADV); + renesas_writel(i3c->regs, STDBR, i3c->i3c_STDBR); + } else { + renesas_i3c_set_prts(i3c, PRTS_PRTMD); + renesas_writel(i3c->regs, STDBR, i3c->i2c_STDBR); + } + + /* Enable I3C bus */ + renesas_set_bit(i3c->regs, BCTL, BCTL_BUSE); +} + +static int renesas_i3c_reset(struct renesas_i3c *i3c) +{ + u32 val; + + renesas_writel(i3c->regs, BCTL, 0); + renesas_set_bit(i3c->regs, RSTCTL, RSTCTL_RI3CRST); + + return read_poll_timeout(renesas_readl, val, !(val & RSTCTL_RI3CRST), + 0, 1000, false, i3c->regs, RSTCTL); +} + +static int renesas_i3c_bus_init(struct i3c_master_controller *m) +{ + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct i3c_bus *bus = i3c_master_get_bus(m); + struct i3c_device_info info = {}; + struct i2c_timings t; + unsigned long rate; + u32 double_SBR, val; + int cks, pp_high_ticks, pp_low_ticks, i3c_total_ticks; + int od_high_ticks, od_low_ticks, i2c_total_ticks; + int ret; + + rate = clk_get_rate(i3c->tclk); + if (!rate) + return -EINVAL; + + ret = renesas_i3c_reset(i3c); + if (ret) + return ret; + + i2c_total_ticks = DIV_ROUND_UP(rate, bus->scl_rate.i2c); + i3c_total_ticks = DIV_ROUND_UP(rate, bus->scl_rate.i3c); + + 
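+ /* Parse optional firmware-provided rise/fall times, then derive the open-drain (I2C) and push-pull (I3C) SCL counts; the loop below halves the reference clock (cks) until the counts fit their register fields. */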
i2c_parse_fw_timings(&m->dev, &t, true); + + for (cks = 0; cks < 7; cks++) { + /* SCL low-period calculation in Open-drain mode */ + od_low_ticks = ((i2c_total_ticks * 6) / 10); + + /* SCL clock calculation in Push-Pull mode */ + if (bus->mode == I3C_BUS_MODE_PURE) + pp_high_ticks = ((i3c_total_ticks * 5) / 10); + else + pp_high_ticks = DIV_ROUND_UP(I3C_BUS_THIGH_MIXED_MAX_NS, + NSEC_PER_SEC / rate); + pp_low_ticks = i3c_total_ticks - pp_high_ticks; + + if ((od_low_ticks / 2) <= 0xFF && pp_low_ticks < 0x3F) + break; + + i2c_total_ticks /= 2; + i3c_total_ticks /= 2; + rate /= 2; + } + + /* SCL clock period calculation in Open-drain mode */ + if ((od_low_ticks / 2) > 0xFF || pp_low_ticks > 0x3F) { + dev_err(&m->dev, "invalid speed (i2c-scl = %lu Hz, i3c-scl = %lu Hz). Too slow.\n", + (unsigned long)bus->scl_rate.i2c, (unsigned long)bus->scl_rate.i3c); + return -EINVAL; + } + + /* SCL high-period calculation in Open-drain mode */ + od_high_ticks = i2c_total_ticks - od_low_ticks; + + /* Standard Bit Rate setting */ + double_SBR = od_low_ticks > 0xFF ? 1 : 0; + i3c->i3c_STDBR = (double_SBR ? STDBR_DSBRPO : 0) | + STDBR_SBRLO(double_SBR, od_low_ticks) | + STDBR_SBRHO(double_SBR, od_high_ticks) | + STDBR_SBRLP(pp_low_ticks) | + STDBR_SBRHP(pp_high_ticks); + + od_low_ticks -= t.scl_fall_ns / (NSEC_PER_SEC / rate) + 1; + od_high_ticks -= t.scl_rise_ns / (NSEC_PER_SEC / rate) + 1; + i3c->i2c_STDBR = (double_SBR ? STDBR_DSBRPO : 0) | + STDBR_SBRLO(double_SBR, od_low_ticks) | + STDBR_SBRHO(double_SBR, od_high_ticks) | + STDBR_SBRLP(pp_low_ticks) | + STDBR_SBRHP(pp_high_ticks); + renesas_writel(i3c->regs, STDBR, i3c->i3c_STDBR); + + /* Extended Bit Rate setting */ + renesas_writel(i3c->regs, EXTBR, EXTBR_EBRLO(od_low_ticks) | + EXTBR_EBRHO(od_high_ticks) | + EXTBR_EBRLP(pp_low_ticks) | + EXTBR_EBRHP(pp_high_ticks)); + + renesas_writel(i3c->regs, REFCKCTL, REFCKCTL_IREFCKS(cks)); + + /* Disable Slave Mode */ + renesas_writel(i3c->regs, SVCTL, 0); + + /* Initialize Queue/Buffer threshold */ + renesas_writel(i3c->regs, NQTHCTL, NQTHCTL_IBIDSSZ(6) | + NQTHCTL_CMDQTH(1)); + + /* The only supported configuration is two entries */ + renesas_writel(i3c->regs, NTBTHCTL0, 0); + /* Interrupt when there is one entry in the queue */ + renesas_writel(i3c->regs, NRQTHCTL, 0); + + /* Enable all Bus/Transfer Status Flags */ + renesas_writel(i3c->regs, BSTE, BSTE_ALL_FLAG); + renesas_writel(i3c->regs, NTSTE, NTSTE_ALL_FLAG); + + /* Interrupt enable settings */ + renesas_writel(i3c->regs, BIE, BIE_NACKDIE | BIE_TENDIE); + renesas_writel(i3c->regs, NTIE, 0); + + /* Clear Status register */ + renesas_writel(i3c->regs, NTST, 0); + renesas_writel(i3c->regs, INST, 0); + renesas_writel(i3c->regs, BST, 0); + + /* Hot-Join Acknowledge setting.
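Hot-Join requests are not serviced yet; HotJoin support is part of the TODO at the top of this file.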
*/ + renesas_set_bit(i3c->regs, BCTL, BCTL_HJACKCTL); + + renesas_writel(i3c->regs, IBINCTL, IBINCTL_NRHJCTL | IBINCTL_NRMRCTL | + IBINCTL_NRSIRCTL); + + renesas_writel(i3c->regs, SCSTLCTL, 0); + renesas_set_bit(i3c->regs, SCSTRCTL, SCSTRCTL_ACKTWE); + + /* Bus condition timing */ + val = DIV_ROUND_UP(I3C_BUS_TBUF_MIXED_FM_MIN_NS, NSEC_PER_SEC / rate); + renesas_writel(i3c->regs, BFRECDT, BFRECDT_FRECYC(val)); + + val = DIV_ROUND_UP(I3C_BUS_TAVAL_MIN_NS, NSEC_PER_SEC / rate); + renesas_writel(i3c->regs, BAVLCDT, BAVLCDT_AVLCYC(val)); + + val = DIV_ROUND_UP(I3C_BUS_TIDLE_MIN_NS, NSEC_PER_SEC / rate); + renesas_writel(i3c->regs, BIDLCDT, BIDLCDT_IDLCYC(val)); + + ret = i3c_master_get_free_addr(m, 0); + if (ret < 0) + return ret; + + renesas_writel(i3c->regs, MSDVAD, MSDVAD_MDYAD(ret) | MSDVAD_MDYADV); + + memset(&info, 0, sizeof(info)); + info.dyn_addr = ret; + return i3c_master_set_info(&i3c->base, &info); +} + +static void renesas_i3c_bus_cleanup(struct i3c_master_controller *m) +{ + struct renesas_i3c *i3c = to_renesas_i3c(m); + + renesas_i3c_reset(i3c); +} + +static int renesas_i3c_daa(struct i3c_master_controller *m) +{ + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_cmd *cmd; + u32 olddevs, newdevs; + u8 last_addr = 0, pos; + int ret; + + struct renesas_i3c_xfer *xfer __free(kfree) = renesas_i3c_alloc_xfer(i3c, 1); + if (!xfer) + return -ENOMEM; + + /* Enable I3C bus. */ + renesas_i3c_bus_enable(m, true); + + olddevs = ~(i3c->free_pos); + i3c->internal_state = I3C_INTERNAL_STATE_CONTROLLER_ENTDAA; + + /* Setting DATBASn registers for target devices. */ + for (pos = 0; pos < i3c->maxdevs; pos++) { + if (olddevs & BIT(pos)) + continue; + + ret = i3c_master_get_free_addr(m, last_addr + 1); + if (ret < 0) + return -ENOSPC; + + i3c->addrs[pos] = ret; + last_addr = ret; + + renesas_writel(i3c->regs, DATBAS(pos), datbas_dvdyad_with_parity(ret)); + } + + init_completion(&xfer->comp); + cmd = xfer->cmds; + cmd->rx_count = 0; + + ret = renesas_i3c_get_free_pos(i3c); + if (ret < 0) + return ret; + + /* + * Setup the command descriptor to start the ENTDAA command + * and starting at the selected device index. + */ + cmd->cmd0 = NCMDQP_CMD_ATTR(NCMDQP_ADDR_ASSGN) | NCMDQP_ROC | + NCMDQP_TID(I3C_COMMAND_ADDRESS_ASSIGNMENT) | + NCMDQP_CMD(I3C_CCC_ENTDAA) | NCMDQP_DEV_INDEX(ret) | + NCMDQP_DEV_COUNT(i3c->maxdevs - ret) | NCMDQP_TOC; + + renesas_i3c_wait_xfer(i3c, xfer); + + newdevs = GENMASK(i3c->maxdevs - cmd->rx_count - 1, 0); + newdevs &= ~olddevs; + + for (pos = 0; pos < i3c->maxdevs; pos++) { + if (newdevs & BIT(pos)) + i3c_master_add_i3c_dev_locked(m, i3c->addrs[pos]); + } + + return ret < 0 ? 
ret : 0; +} + +static bool renesas_i3c_supports_ccc_cmd(struct i3c_master_controller *m, + const struct i3c_ccc_cmd *cmd) +{ + if (cmd->ndests > 1) + return false; + + switch (cmd->id) { + case I3C_CCC_ENEC(true): + case I3C_CCC_ENEC(false): + case I3C_CCC_DISEC(true): + case I3C_CCC_DISEC(false): + case I3C_CCC_ENTAS(0, true): + case I3C_CCC_ENTAS(1, true): + case I3C_CCC_ENTAS(2, true): + case I3C_CCC_ENTAS(3, true): + case I3C_CCC_ENTAS(0, false): + case I3C_CCC_ENTAS(1, false): + case I3C_CCC_ENTAS(2, false): + case I3C_CCC_ENTAS(3, false): + case I3C_CCC_RSTDAA(true): + case I3C_CCC_RSTDAA(false): + case I3C_CCC_ENTDAA: + case I3C_CCC_DEFSLVS: + case I3C_CCC_SETMWL(true): + case I3C_CCC_SETMWL(false): + case I3C_CCC_SETMRL(true): + case I3C_CCC_SETMRL(false): + case I3C_CCC_ENTTM: + case I3C_CCC_SETDASA: + case I3C_CCC_SETNEWDA: + case I3C_CCC_GETMWL: + case I3C_CCC_GETMRL: + case I3C_CCC_GETPID: + case I3C_CCC_GETBCR: + case I3C_CCC_GETDCR: + case I3C_CCC_GETSTATUS: + case I3C_CCC_GETACCMST: + case I3C_CCC_GETMXDS: + return true; + default: + return false; + } +} + +static int renesas_i3c_send_ccc_cmd(struct i3c_master_controller *m, + struct i3c_ccc_cmd *ccc) +{ + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_xfer *xfer; + struct renesas_i3c_cmd *cmd; + int ret, pos = 0; + + if (ccc->id & I3C_CCC_DIRECT) { + pos = renesas_i3c_get_addr_pos(i3c, ccc->dests[0].addr); + if (pos < 0) + return pos; + } + + xfer = renesas_i3c_alloc_xfer(i3c, 1); + if (!xfer) + return -ENOMEM; + + renesas_i3c_bus_enable(m, true); + + init_completion(&xfer->comp); + cmd = xfer->cmds; + cmd->rnw = ccc->rnw; + cmd->cmd0 = 0; + + /* Calculate the command descriptor. */ + switch (ccc->id) { + case I3C_CCC_SETDASA: + renesas_writel(i3c->regs, DATBAS(pos), + DATBAS_DVSTAD(ccc->dests[0].addr) | + DATBAS_DVDYAD(*(u8 *)ccc->dests[0].payload.data >> 1)); + cmd->cmd0 = NCMDQP_CMD_ATTR(NCMDQP_ADDR_ASSGN) | NCMDQP_ROC | + NCMDQP_TID(I3C_COMMAND_ADDRESS_ASSIGNMENT) | + NCMDQP_CMD(I3C_CCC_SETDASA) | NCMDQP_DEV_INDEX(pos) | + NCMDQP_DEV_COUNT(0) | NCMDQP_TOC; + i3c->internal_state = I3C_INTERNAL_STATE_CONTROLLER_SETDASA; + break; + default: + /* Calculate the command descriptor. */ + cmd->cmd0 = NCMDQP_TID(I3C_COMMAND_WRITE) | NCMDQP_MODE(0) | + NCMDQP_RNW(ccc->rnw) | NCMDQP_CMD(ccc->id) | + NCMDQP_ROC | NCMDQP_TOC | NCMDQP_CP | + NCMDQP_DEV_INDEX(pos); + + if (ccc->rnw) { + cmd->rx_buf = ccc->dests[0].payload.data; + cmd->len = ccc->dests[0].payload.len; + cmd->rx_count = 0; + i3c->internal_state = I3C_INTERNAL_STATE_CONTROLLER_COMMAND_READ; + } else { + cmd->tx_buf = ccc->dests[0].payload.data; + cmd->len = ccc->dests[0].payload.len; + cmd->tx_count = 0; + i3c->internal_state = I3C_INTERNAL_STATE_CONTROLLER_COMMAND_WRITE; + } + } + + renesas_i3c_wait_xfer(i3c, xfer); + + ret = xfer->ret; + if (ret) + ccc->err = I3C_ERROR_M2; + + kfree(xfer); + + return ret; +} + +static int renesas_i3c_priv_xfers(struct i3c_dev_desc *dev, struct i3c_priv_xfer *i3c_xfers, + int i3c_nxfers) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + struct renesas_i3c_xfer *xfer; + int i; + + /* Enable I3C bus. 
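The transfers below are queued as SDR private transfers (NCMDQP_MODE(0)).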
*/ + renesas_i3c_bus_enable(m, true); + + xfer = renesas_i3c_alloc_xfer(i3c, 1); + if (!xfer) + return -ENOMEM; + + init_completion(&xfer->comp); + + for (i = 0; i < i3c_nxfers; i++) { + struct renesas_i3c_cmd *cmd = xfer->cmds; + + /* Calculate the Transfer Command Descriptor */ + cmd->rnw = i3c_xfers[i].rnw; + cmd->cmd0 = NCMDQP_DEV_INDEX(data->index) | NCMDQP_MODE(0) | + NCMDQP_RNW(cmd->rnw) | NCMDQP_ROC | NCMDQP_TOC; + + if (i3c_xfers[i].rnw) { + cmd->rx_count = 0; + cmd->cmd0 |= NCMDQP_TID(I3C_READ); + cmd->rx_buf = i3c_xfers[i].data.in; + cmd->len = i3c_xfers[i].len; + i3c->internal_state = I3C_INTERNAL_STATE_CONTROLLER_READ; + } else { + cmd->tx_count = 0; + cmd->cmd0 |= NCMDQP_TID(I3C_WRITE); + cmd->tx_buf = i3c_xfers[i].data.out; + cmd->len = i3c_xfers[i].len; + i3c->internal_state = I3C_INTERNAL_STATE_CONTROLLER_WRITE; + } + + if (!i3c_xfers[i].rnw && i3c_xfers[i].len > 4) { + i3c_writel_fifo(i3c->regs + NTDTBP0, cmd->tx_buf, cmd->len); + if (cmd->len > NTDTBP0_DEPTH * sizeof(u32)) + renesas_set_bit(i3c->regs, NTIE, NTIE_TDBEIE0); + } + + renesas_i3c_wait_xfer(i3c, xfer); + } + + return 0; +} + +static int renesas_i3c_attach_i3c_dev(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_i2c_dev_data *data; + int pos; + + pos = renesas_i3c_get_free_pos(i3c); + if (pos < 0) + return pos; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->index = pos; + i3c->addrs[pos] = dev->info.dyn_addr ? : dev->info.static_addr; + i3c->free_pos &= ~BIT(pos); + + renesas_writel(i3c->regs, DATBAS(pos), DATBAS_DVSTAD(dev->info.static_addr) | + datbas_dvdyad_with_parity(i3c->addrs[pos])); + i3c_dev_set_master_data(dev, data); + + return 0; +} + +static int renesas_i3c_reattach_i3c_dev(struct i3c_dev_desc *dev, + u8 old_dyn_addr) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + + i3c->addrs[data->index] = dev->info.dyn_addr ? 
dev->info.dyn_addr : + dev->info.static_addr; + + return 0; +} + +static void renesas_i3c_detach_i3c_dev(struct i3c_dev_desc *dev) +{ + struct renesas_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + + i3c_dev_set_master_data(dev, NULL); + i3c->addrs[data->index] = 0; + i3c->free_pos |= BIT(data->index); + kfree(data); +} + +static int renesas_i3c_i2c_xfers(struct i2c_dev_desc *dev, + struct i2c_msg *i2c_xfers, + int i2c_nxfers) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_cmd *cmd; + u8 start_bit = CNDCTL_STCND; + int i; + + struct renesas_i3c_xfer *xfer __free(kfree) = renesas_i3c_alloc_xfer(i3c, 1); + if (!xfer) + return -ENOMEM; + + if (!i2c_nxfers) + return 0; + + renesas_i3c_bus_enable(m, false); + + init_completion(&xfer->comp); + xfer->is_i2c_xfer = true; + cmd = xfer->cmds; + + if (!(renesas_readl(i3c->regs, BCST) & BCST_BFREF)) { + cmd->err = -EBUSY; + return cmd->err; + } + + renesas_writel(i3c->regs, BST, 0); + + renesas_i3c_enqueue_xfer(i3c, xfer); + + for (i = 0; i < i2c_nxfers; i++) { + cmd->i2c_bytes_left = I2C_INIT_MSG; + cmd->i2c_buf = i2c_xfers[i].buf; + cmd->msg = &i2c_xfers[i]; + cmd->i2c_is_last = (i == i2c_nxfers - 1); + + renesas_set_bit(i3c->regs, BIE, BIE_NACKDIE); + renesas_set_bit(i3c->regs, NTIE, NTIE_TDBEIE0); + renesas_set_bit(i3c->regs, BIE, BIE_STCNDDIE); + + /* Issue Start condition */ + renesas_set_bit(i3c->regs, CNDCTL, start_bit); + + renesas_set_bit(i3c->regs, NTSTE, NTSTE_TDBEE0); + + wait_for_completion_timeout(&xfer->comp, m->i2c.timeout); + + if (cmd->err) + break; + + start_bit = CNDCTL_SRCND; + } + + renesas_i3c_dequeue_xfer(i3c, xfer); + return cmd->err; +} + +static int renesas_i3c_attach_i2c_dev(struct i2c_dev_desc *dev) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + struct renesas_i3c_i2c_dev_data *data; + int pos; + + pos = renesas_i3c_get_free_pos(i3c); + if (pos < 0) + return pos; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->index = pos; + i3c->addrs[pos] = dev->addr; + i3c->free_pos &= ~BIT(pos); + i2c_dev_set_master_data(dev, data); + + return 0; +} + +static void renesas_i3c_detach_i2c_dev(struct i2c_dev_desc *dev) +{ + struct renesas_i3c_i2c_dev_data *data = i2c_dev_get_master_data(dev); + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct renesas_i3c *i3c = to_renesas_i3c(m); + + i2c_dev_set_master_data(dev, NULL); + i3c->addrs[data->index] = 0; + i3c->free_pos |= BIT(data->index); + kfree(data); +} + +static irqreturn_t renesas_i3c_tx_isr(int irq, void *data) +{ + struct renesas_i3c *i3c = data; + struct renesas_i3c_xfer *xfer; + struct renesas_i3c_cmd *cmd; + u8 val; + + scoped_guard(spinlock, &i3c->xferqueue.lock) { + xfer = i3c->xferqueue.cur; + cmd = xfer->cmds; + + if (xfer->is_i2c_xfer) { + if (!cmd->i2c_bytes_left) + return IRQ_NONE; + + if (cmd->i2c_bytes_left != I2C_INIT_MSG) { + val = *cmd->i2c_buf; + cmd->i2c_buf++; + cmd->i2c_bytes_left--; + renesas_writel(i3c->regs, NTDTBP0, val); + } + + if (cmd->i2c_bytes_left == 0) { + renesas_clear_bit(i3c->regs, NTIE, NTIE_TDBEIE0); + renesas_set_bit(i3c->regs, BIE, BIE_TENDIE); + } + + /* Clear the Transmit Buffer Empty status flag. 
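Clearing it re-arms the interrupt for the next buffer-empty event.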
*/ + renesas_clear_bit(i3c->regs, NTST, NTST_TDBEF0); + } else { + i3c_writel_fifo(i3c->regs + NTDTBP0, cmd->tx_buf, cmd->len); + } + } + + return IRQ_HANDLED; +} + +static irqreturn_t renesas_i3c_resp_isr(int irq, void *data) +{ + struct renesas_i3c *i3c = data; + struct renesas_i3c_xfer *xfer; + struct renesas_i3c_cmd *cmd; + u32 resp_descriptor = renesas_readl(i3c->regs, NRSPQP); + u32 bytes_remaining = 0; + u32 ntst, data_len; + int ret = 0; + + scoped_guard(spinlock, &i3c->xferqueue.lock) { + xfer = i3c->xferqueue.cur; + cmd = xfer->cmds; + + /* Clear the Response Queue Full status flag */ + renesas_clear_bit(i3c->regs, NTST, NTST_RSPQFF); + + data_len = NRSPQP_DATA_LEN(resp_descriptor); + + switch (i3c->internal_state) { + case I3C_INTERNAL_STATE_CONTROLLER_ENTDAA: + cmd->rx_count = data_len; + break; + case I3C_INTERNAL_STATE_CONTROLLER_WRITE: + case I3C_INTERNAL_STATE_CONTROLLER_COMMAND_WRITE: + /* Disable the transmit IRQ if it hasn't been disabled already. */ + renesas_clear_bit(i3c->regs, NTIE, NTIE_TDBEIE0); + break; + case I3C_INTERNAL_STATE_CONTROLLER_READ: + case I3C_INTERNAL_STATE_CONTROLLER_COMMAND_READ: + if (NDBSTLV0_RDBLV(renesas_readl(i3c->regs, NDBSTLV0)) && !cmd->err) + bytes_remaining = data_len - cmd->rx_count; + + i3c_readl_fifo(i3c->regs + NTDTBP0, cmd->rx_buf, bytes_remaining); + renesas_clear_bit(i3c->regs, NTIE, NTIE_RDBFIE0); + break; + default: + break; + } + + switch (NRSPQP_ERR_STATUS(resp_descriptor)) { + case NRSPQP_NO_ERROR: + break; + case NRSPQP_ERROR_PARITY: + case NRSPQP_ERROR_IBA_NACK: + case NRSPQP_ERROR_TRANSF_ABORT: + case NRSPQP_ERROR_CRC: + case NRSPQP_ERROR_FRAME: + ret = -EIO; + break; + case NRSPQP_ERROR_OVER_UNDER_FLOW: + ret = -ENOSPC; + break; + case NRSPQP_ERROR_UNSUPPORTED: + ret = -EOPNOTSUPP; + break; + case NRSPQP_ERROR_I2C_W_NACK_ERR: + case NRSPQP_ERROR_ADDRESS_NACK: + default: + ret = -EINVAL; + break; + } + + /* + * If the transfer was aborted, then the abort flag must be cleared + * before notifying the application that a transfer has completed. + */ + ntst = renesas_readl(i3c->regs, NTST); + if (ntst & NTST_TABTF) + renesas_clear_bit(i3c->regs, BCTL, BCTL_ABT); + + /* Clear error status flags.
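Both NTST_TEF and NTST_TABTF are cleared so a stale error is not carried over into the next transfer.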
*/ + renesas_clear_bit(i3c->regs, NTST, NTST_TEF | NTST_TABTF); + + xfer->ret = ret; + complete(&xfer->comp); + + xfer = list_first_entry_or_null(&i3c->xferqueue.list, + struct renesas_i3c_xfer, node); + if (xfer) + list_del_init(&xfer->node); + + i3c->xferqueue.cur = xfer; + } + + return IRQ_HANDLED; +} + +static irqreturn_t renesas_i3c_tend_isr(int irq, void *data) +{ + struct renesas_i3c *i3c = data; + struct renesas_i3c_xfer *xfer; + struct renesas_i3c_cmd *cmd; + + scoped_guard(spinlock, &i3c->xferqueue.lock) { + xfer = i3c->xferqueue.cur; + cmd = xfer->cmds; + + if (xfer->is_i2c_xfer) { + if (renesas_readl(i3c->regs, BST) & BST_NACKDF) { + /* We got a NACKIE */ + renesas_readl(i3c->regs, NTDTBP0); /* dummy read */ + renesas_clear_bit(i3c->regs, BST, BST_NACKDF); + cmd->err = -ENXIO; + } else if (cmd->i2c_bytes_left) { + renesas_set_bit(i3c->regs, NTIE, NTIE_TDBEIE0); + return IRQ_NONE; + } + + if (cmd->i2c_is_last || cmd->err) { + renesas_clear_bit(i3c->regs, BIE, BIE_TENDIE); + renesas_set_bit(i3c->regs, BIE, BIE_SPCNDDIE); + renesas_set_bit(i3c->regs, CNDCTL, CNDCTL_SPCND); + } else { + /* Transfer is complete, but do not send STOP */ + renesas_clear_bit(i3c->regs, NTSTE, NTSTE_TDBEE0); + renesas_clear_bit(i3c->regs, BIE, BIE_TENDIE); + xfer->ret = 0; + complete(&xfer->comp); + } + } + + /* Clear the Transmit Buffer Empty status flag. */ + renesas_clear_bit(i3c->regs, BST, BST_TENDF); + } + + return IRQ_HANDLED; +} + +static irqreturn_t renesas_i3c_rx_isr(int irq, void *data) +{ + struct renesas_i3c *i3c = data; + struct renesas_i3c_xfer *xfer; + struct renesas_i3c_cmd *cmd; + int read_bytes; + + /* If resp_isr already read the data and updated 'xfer', we can just leave */ + if (!(renesas_readl(i3c->regs, NTIE) & NTIE_RDBFIE0)) + return IRQ_NONE; + + scoped_guard(spinlock, &i3c->xferqueue.lock) { + xfer = i3c->xferqueue.cur; + cmd = xfer->cmds; + + if (xfer->is_i2c_xfer) { + if (!cmd->i2c_bytes_left) + return IRQ_NONE; + + if (cmd->i2c_bytes_left == I2C_INIT_MSG) { + cmd->i2c_bytes_left = cmd->msg->len; + renesas_set_bit(i3c->regs, SCSTRCTL, SCSTRCTL_RWE); + renesas_readl(i3c->regs, NTDTBP0); /* dummy read */ + if (cmd->i2c_bytes_left == 1) + renesas_writel(i3c->regs, ACKCTL, ACKCTL_ACKT | ACKCTL_ACKTWP); + return IRQ_HANDLED; + } + + if (cmd->i2c_bytes_left == 1) { + /* STOP must come before we set ACKCTL! */ + if (cmd->i2c_is_last) { + renesas_set_bit(i3c->regs, BIE, BIE_SPCNDDIE); + renesas_clear_bit(i3c->regs, BST, BST_SPCNDDF); + renesas_set_bit(i3c->regs, CNDCTL, CNDCTL_SPCND); + } + renesas_writel(i3c->regs, ACKCTL, ACKCTL_ACKT | ACKCTL_ACKTWP); + } else { + renesas_writel(i3c->regs, ACKCTL, ACKCTL_ACKTWP); + } + + /* Reading acks the RIE interrupt */ + *cmd->i2c_buf = renesas_readl(i3c->regs, NTDTBP0); + cmd->i2c_buf++; + cmd->i2c_bytes_left--; + } else { + read_bytes = NDBSTLV0_RDBLV(renesas_readl(i3c->regs, NDBSTLV0)) * sizeof(u32); + i3c_readl_fifo(i3c->regs + NTDTBP0, cmd->rx_buf, read_bytes); + cmd->rx_count = read_bytes; + } + + /* Clear the Read Buffer Full status flag. 
*/ + renesas_clear_bit(i3c->regs, NTST, NTST_RDBFF0); + } + + return IRQ_HANDLED; +} + +static irqreturn_t renesas_i3c_stop_isr(int irq, void *data) +{ + struct renesas_i3c *i3c = data; + struct renesas_i3c_xfer *xfer; + + scoped_guard(spinlock, &i3c->xferqueue.lock) { + xfer = i3c->xferqueue.cur; + + /* read back registers to confirm writes have fully propagated */ + renesas_writel(i3c->regs, BST, 0); + renesas_readl(i3c->regs, BST); + renesas_writel(i3c->regs, BIE, 0); + renesas_clear_bit(i3c->regs, NTST, NTST_TDBEF0 | NTST_RDBFF0); + renesas_clear_bit(i3c->regs, SCSTRCTL, SCSTRCTL_RWE); + + xfer->ret = 0; + complete(&xfer->comp); + } + + return IRQ_HANDLED; +} + +static irqreturn_t renesas_i3c_start_isr(int irq, void *data) +{ + struct renesas_i3c *i3c = data; + struct renesas_i3c_xfer *xfer; + struct renesas_i3c_cmd *cmd; + u8 val; + + scoped_guard(spinlock, &i3c->xferqueue.lock) { + xfer = i3c->xferqueue.cur; + cmd = xfer->cmds; + + if (xfer->is_i2c_xfer) { + if (!cmd->i2c_bytes_left) + return IRQ_NONE; + + if (cmd->i2c_bytes_left == I2C_INIT_MSG) { + if (cmd->msg->flags & I2C_M_RD) { + /* On read, switch over to receive interrupt */ + renesas_clear_bit(i3c->regs, NTIE, NTIE_TDBEIE0); + renesas_set_bit(i3c->regs, NTIE, NTIE_RDBFIE0); + } else { + /* On write, initialize length */ + cmd->i2c_bytes_left = cmd->msg->len; + } + + val = i2c_8bit_addr_from_msg(cmd->msg); + renesas_writel(i3c->regs, NTDTBP0, val); + } + } + + renesas_clear_bit(i3c->regs, BIE, BIE_STCNDDIE); + renesas_clear_bit(i3c->regs, BST, BST_STCNDDF); + } + + return IRQ_HANDLED; +} + +static const struct i3c_master_controller_ops renesas_i3c_ops = { + .bus_init = renesas_i3c_bus_init, + .bus_cleanup = renesas_i3c_bus_cleanup, + .attach_i3c_dev = renesas_i3c_attach_i3c_dev, + .reattach_i3c_dev = renesas_i3c_reattach_i3c_dev, + .detach_i3c_dev = renesas_i3c_detach_i3c_dev, + .do_daa = renesas_i3c_daa, + .supports_ccc_cmd = renesas_i3c_supports_ccc_cmd, + .send_ccc_cmd = renesas_i3c_send_ccc_cmd, + .priv_xfers = renesas_i3c_priv_xfers, + .attach_i2c_dev = renesas_i3c_attach_i2c_dev, + .detach_i2c_dev = renesas_i3c_detach_i2c_dev, + .i2c_xfers = renesas_i3c_i2c_xfers, +}; + +static const struct renesas_i3c_irq_desc renesas_i3c_irqs[] = { + { .name = "resp", .isr = renesas_i3c_resp_isr, .desc = "i3c-resp" }, + { .name = "rx", .isr = renesas_i3c_rx_isr, .desc = "i3c-rx" }, + { .name = "tx", .isr = renesas_i3c_tx_isr, .desc = "i3c-tx" }, + { .name = "st", .isr = renesas_i3c_start_isr, .desc = "i3c-start" }, + { .name = "sp", .isr = renesas_i3c_stop_isr, .desc = "i3c-stop" }, + { .name = "tend", .isr = renesas_i3c_tend_isr, .desc = "i3c-tend" }, + { .name = "nack", .isr = renesas_i3c_tend_isr, .desc = "i3c-nack" }, +}; + +static int renesas_i3c_probe(struct platform_device *pdev) +{ + struct renesas_i3c *i3c; + struct reset_control *reset; + struct clk *clk; + const struct renesas_i3c_config *config = of_device_get_match_data(&pdev->dev); + int ret, i; + + if (!config) + return -ENODATA; + + i3c = devm_kzalloc(&pdev->dev, sizeof(*i3c), GFP_KERNEL); + if (!i3c) + return -ENOMEM; + + i3c->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(i3c->regs)) + return PTR_ERR(i3c->regs); + + clk = devm_clk_get_enabled(&pdev->dev, "pclk"); + if (IS_ERR(clk)) + return PTR_ERR(clk); + + if (config->has_pclkrw) { + clk = devm_clk_get_enabled(&pdev->dev, "pclkrw"); + if (IS_ERR(clk)) + return PTR_ERR(clk); + } + + i3c->tclk = devm_clk_get_enabled(&pdev->dev, "tclk"); + if (IS_ERR(i3c->tclk)) + return PTR_ERR(i3c->tclk); + + reset = 
devm_reset_control_get_optional_exclusive_deasserted(&pdev->dev, "tresetn"); + if (IS_ERR(reset)) + return dev_err_probe(&pdev->dev, PTR_ERR(reset), + "Error: missing tresetn ctrl\n"); + + reset = devm_reset_control_get_optional_exclusive_deasserted(&pdev->dev, "presetn"); + if (IS_ERR(reset)) + return dev_err_probe(&pdev->dev, PTR_ERR(reset), + "Error: missing presetn ctrl\n"); + + spin_lock_init(&i3c->xferqueue.lock); + INIT_LIST_HEAD(&i3c->xferqueue.list); + + ret = renesas_i3c_reset(i3c); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(renesas_i3c_irqs); i++) { + ret = platform_get_irq_byname(pdev, renesas_i3c_irqs[i].name); + if (ret < 0) + return ret; + + ret = devm_request_irq(&pdev->dev, ret, renesas_i3c_irqs[i].isr, + 0, renesas_i3c_irqs[i].desc, i3c); + if (ret) + return ret; + } + + platform_set_drvdata(pdev, i3c); + + i3c->maxdevs = RENESAS_I3C_MAX_DEVS; + i3c->free_pos = GENMASK(i3c->maxdevs - 1, 0); + + return i3c_master_register(&i3c->base, &pdev->dev, &renesas_i3c_ops, false); +} + +static void renesas_i3c_remove(struct platform_device *pdev) +{ + struct renesas_i3c *i3c = platform_get_drvdata(pdev); + + i3c_master_unregister(&i3c->base); +} + +static const struct renesas_i3c_config empty_i3c_config = { +}; + +static const struct renesas_i3c_config r9a09g047_i3c_config = { + .has_pclkrw = 1, +}; + +static const struct of_device_id renesas_i3c_of_ids[] = { + { .compatible = "renesas,r9a08g045-i3c", .data = &empty_i3c_config }, + { .compatible = "renesas,r9a09g047-i3c", .data = &r9a09g047_i3c_config }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, renesas_i3c_of_ids); + +static struct platform_driver renesas_i3c = { + .probe = renesas_i3c_probe, + .remove = renesas_i3c_remove, + .driver = { + .name = "renesas-i3c", + .of_match_table = renesas_i3c_of_ids, + }, +}; +module_platform_driver(renesas_i3c); + +MODULE_AUTHOR("Wolfram Sang <wsa+renesas@sang-engineering.com>"); +MODULE_AUTHOR("Renesas BSP teams"); +MODULE_DESCRIPTION("Renesas I3C controller driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 7e1a7cb94b43..701ae165b25b 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -104,6 +104,7 @@ #define SVC_I3C_MDATACTRL_TXTRIG_FIFO_NOT_FULL GENMASK(5, 4) #define SVC_I3C_MDATACTRL_RXTRIG_FIFO_NOT_EMPTY 0 #define SVC_I3C_MDATACTRL_RXCOUNT(x) FIELD_GET(GENMASK(28, 24), (x)) +#define SVC_I3C_MDATACTRL_TXCOUNT(x) FIELD_GET(GENMASK(20, 16), (x)) #define SVC_I3C_MDATACTRL_TXFULL BIT(30) #define SVC_I3C_MDATACTRL_RXEMPTY BIT(31) @@ -664,7 +665,6 @@ static int svc_i3c_master_set_speed(struct i3c_master_controller *m, } rpm_out: - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; @@ -779,7 +779,6 @@ static int svc_i3c_master_bus_init(struct i3c_master_controller *m) goto rpm_out; rpm_out: - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; @@ -801,7 +800,6 @@ static void svc_i3c_master_bus_cleanup(struct i3c_master_controller *m) /* Disable master */ writel(0, master->regs + SVC_I3C_MCONFIG); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); } @@ -1207,7 +1205,6 @@ static int svc_i3c_master_do_daa(struct i3c_master_controller *m) dev_err(master->dev, "Cannot handle such a list of devices"); rpm_out: - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; @@ -1304,14 +1301,19 @@ static int 
svc_i3c_master_xfer(struct svc_i3c_master *master, * FIFO start filling as soon as possible after EmitStartAddr. */ if (svc_has_quirk(master, SVC_I3C_QUIRK_FIFO_EMPTY) && !rnw && xfer_len) { - u32 end = xfer_len > SVC_I3C_FIFO_SIZE ? 0 : SVC_I3C_MWDATAB_END; - u32 len = min_t(u32, xfer_len, SVC_I3C_FIFO_SIZE); - - writesb(master->regs + SVC_I3C_MWDATAB1, out, len - 1); - /* Mark END bit if this is the last byte */ - writel(out[len - 1] | end, master->regs + SVC_I3C_MWDATAB); - xfer_len -= len; - out += len; + u32 space, end, len; + + reg = readl(master->regs + SVC_I3C_MDATACTRL); + space = SVC_I3C_FIFO_SIZE - SVC_I3C_MDATACTRL_TXCOUNT(reg); + if (space) { + end = xfer_len > space ? 0 : SVC_I3C_MWDATAB_END; + len = min_t(u32, xfer_len, space); + writesb(master->regs + SVC_I3C_MWDATAB1, out, len - 1); + /* Mark END bit if this is the last byte */ + writel(out[len - 1] | end, master->regs + SVC_I3C_MWDATAB); + xfer_len -= len; + out += len; + } } ret = readl_poll_timeout(master->regs + SVC_I3C_MSTATUS, reg, @@ -1511,7 +1513,6 @@ static void svc_i3c_master_enqueue_xfer(struct svc_i3c_master *master, } spin_unlock_irqrestore(&master->xferqueue.lock, flags); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); } @@ -1708,7 +1709,7 @@ static int svc_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, mutex_lock(&master->lock); svc_i3c_master_enqueue_xfer(master, xfer); - if (!wait_for_completion_timeout(&xfer->comp, msecs_to_jiffies(1000))) + if (!wait_for_completion_timeout(&xfer->comp, m->i2c.timeout)) svc_i3c_master_dequeue_xfer(master, xfer); mutex_unlock(&master->lock); @@ -1801,7 +1802,6 @@ static int svc_i3c_master_disable_ibi(struct i3c_dev_desc *dev) ret = i3c_master_disec_locked(m, dev->info.dyn_addr, I3C_CCC_EVENT_SIR); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return ret; @@ -1834,7 +1834,6 @@ static int svc_i3c_master_disable_hotjoin(struct i3c_master_controller *m) if (!master->enabled_events) svc_i3c_master_disable_interrupts(master); - pm_runtime_mark_last_busy(master->dev); pm_runtime_put_autosuspend(master->dev); return 0; @@ -1954,7 +1953,6 @@ static int svc_i3c_master_probe(struct platform_device *pdev) if (ret) goto rpm_disable; - pm_runtime_mark_last_busy(&pdev->dev); pm_runtime_put_autosuspend(&pdev->dev); return 0; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index c711db6f8f5c..cf17fd46e255 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -215,16 +215,19 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (test_bit(DROP_WRITES, &fc->flags) && - (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) || + fc->random_write_corrupt)) { ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; } else if (test_bit(ERROR_WRITES, &fc->flags) && - (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) || + fc->random_write_corrupt)) { ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; } else if (test_bit(ERROR_READS, &fc->flags) && - (fc->corrupt_bio_rw == READ || fc->random_read_corrupt)) { + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == READ) || + fc->random_read_corrupt)) { ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag 
set"; return -EINVAL; } diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index b90f34259fbb..8b50c908c6f4 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -241,10 +241,11 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl /* * First retrieve the target metadata. */ - scnprintf(target_metadata_buf, DM_IMA_TARGET_METADATA_BUF_LEN, - "target_index=%d,target_begin=%llu,target_len=%llu,", - i, ti->begin, ti->len); - target_metadata_buf_len = strlen(target_metadata_buf); + target_metadata_buf_len = + scnprintf(target_metadata_buf, + DM_IMA_TARGET_METADATA_BUF_LEN, + "target_index=%d,target_begin=%llu,target_len=%llu,", + i, ti->begin, ti->len); /* * Then retrieve the actual target data. @@ -448,11 +449,9 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) if (r) goto error; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;device_resume=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); - + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_resume=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } capacity_len = strlen(capacity_str); @@ -561,10 +560,9 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;device_remove=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_remove=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } memcpy(device_table_data + l, remove_all_str, remove_all_len); @@ -647,10 +645,9 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error2; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;table_clear=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;table_clear=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } capacity_len = strlen(capacity_str); @@ -706,7 +703,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; bool noio = true; - int r; + int r, len; if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, md->ima.active_table.num_targets, noio)) @@ -728,12 +725,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) md->ima.active_table.device_metadata = new_device_data; md->ima.active_table.device_metadata_len = strlen(new_device_data); - scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, - "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, - new_dev_name, new_dev_uuid, capacity_str); + len = scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, + "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, + new_dev_name, new_dev_uuid, capacity_str); - dm_ima_measure_data("dm_device_rename", combined_device_data, strlen(combined_device_data), - noio); + dm_ima_measure_data("dm_device_rename", combined_device_data, len, noio); goto exit; diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index 
3e4cb81ce512..d0b883fabfeb 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -117,16 +117,16 @@ int dm_register_path_selector(struct path_selector_type *pst) } EXPORT_SYMBOL_GPL(dm_register_path_selector); -int dm_unregister_path_selector(struct path_selector_type *pst) +void dm_unregister_path_selector(struct path_selector_type *pst) { struct ps_internal *psi; down_write(&_ps_lock); psi = __find_path_selector_type(pst->name); - if (!psi) { + if (WARN_ON(!psi)) { up_write(&_ps_lock); - return -EINVAL; + return; } list_del(&psi->list); @@ -134,7 +134,5 @@ int dm_unregister_path_selector(struct path_selector_type *pst) up_write(&_ps_lock); kfree(psi); - - return 0; } EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 3861b2d8b963..7b2270532e64 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -96,7 +96,7 @@ struct path_selector_type { int dm_register_path_selector(struct path_selector_type *type); /* Unregister a path selector */ -int dm_unregister_path_selector(struct path_selector_type *type); +void dm_unregister_path_selector(struct path_selector_type *type); /* Returns a registered path selector type */ struct path_selector_type *dm_get_path_selector(const char *name); diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index b49e10d76d03..f07e773d9cc0 100644 --- a/drivers/md/dm-ps-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c @@ -541,8 +541,10 @@ static int __init dm_hst_init(void) { int r = dm_register_path_selector(&hst_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " HST_VERSION " loaded"); @@ -551,10 +553,7 @@ static int __init dm_hst_init(void) static void __exit dm_hst_exit(void) { - int r = dm_unregister_path_selector(&hst_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&hst_ps); } module_init(dm_hst_init); diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c index 716807e511ee..80415a045c68 100644 --- a/drivers/md/dm-ps-io-affinity.c +++ b/drivers/md/dm-ps-io-affinity.c @@ -260,10 +260,7 @@ static int __init dm_ioa_init(void) static void __exit dm_ioa_exit(void) { - int ret = dm_unregister_path_selector(&ioa_ps); - - if (ret < 0) - DMERR("unregister failed %d", ret); + dm_unregister_path_selector(&ioa_ps); } module_init(dm_ioa_init); diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c index e305f05ad1e5..9c68701ed7a4 100644 --- a/drivers/md/dm-ps-queue-length.c +++ b/drivers/md/dm-ps-queue-length.c @@ -260,8 +260,10 @@ static int __init dm_ql_init(void) { int r = dm_register_path_selector(&ql_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " QL_VERSION " loaded"); @@ -270,10 +272,7 @@ static int __init dm_ql_init(void) static void __exit dm_ql_exit(void) { - int r = dm_unregister_path_selector(&ql_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&ql_ps); } module_init(dm_ql_init); diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c index d1745b123dc1..0c12f4073461 100644 --- a/drivers/md/dm-ps-round-robin.c +++ b/drivers/md/dm-ps-round-robin.c @@ -220,8 +220,10 @@ static int __init dm_rr_init(void) { int r = dm_register_path_selector(&rr_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } 
DMINFO("version " RR_VERSION " loaded"); @@ -230,10 +232,7 @@ static int __init dm_rr_init(void) static void __exit dm_rr_exit(void) { - int r = dm_unregister_path_selector(&rr_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&rr_ps); } module_init(dm_rr_init); diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c index 969d31c40272..0543fe7969c4 100644 --- a/drivers/md/dm-ps-service-time.c +++ b/drivers/md/dm-ps-service-time.c @@ -341,8 +341,10 @@ static int __init dm_st_init(void) { int r = dm_register_path_selector(&st_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " ST_VERSION " loaded"); @@ -351,10 +353,7 @@ static int __init dm_st_init(void) static void __exit dm_st_exit(void) { - int r = dm_unregister_path_selector(&st_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&st_ps); } module_init(dm_st_init); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e8c0a8c6fb51..15c538ee9537 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -14,7 +14,6 @@ #include "raid5.h" #include "raid10.h" #include "md-bitmap.h" -#include "dm-core.h" #include <linux/device-mapper.h> @@ -2532,6 +2531,10 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; + /* Respect resynchronization requested with "sync" argument. */ + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + freshest = NULL; rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) @@ -3305,7 +3308,7 @@ size_check: /* Disable/enable discard support on raid set. */ configure_discard_support(rs); - rs->md.dm_gendisk = ti->table->md->disk; + rs->md.dm_gendisk = dm_disk(dm_table_get_md(ti->table)); mddev_unlock(&rs->md); return 0; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d9d5e6aa5707..ad0a60a07b93 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -899,17 +899,17 @@ static bool dm_table_supports_dax(struct dm_table *t, return true; } -static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_not_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct block_device *bdev = dev->bdev; struct request_queue *q = bdev_get_queue(bdev); /* request-based cannot stack on partitions! 
*/ if (bdev_is_partition(bdev)) - return false; + return true; - return queue_is_mq(q); + return !queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) @@ -1005,7 +1005,7 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { + ti->type->iterate_devices(ti, device_is_not_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 05cf4e3f2bbe..007bb93e5fca 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4111,8 +4111,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | - DM_TARGET_IMMUTABLE, - .version = {1, 23, 0}, + DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_CRYPTO, + .version = {1, 24, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4497,7 +4497,8 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 23, 0}, + .features = DM_TARGET_PASSES_CRYPTO, + .version = {1, 24, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index ae11941c90a9..0613c82bbe8e 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -252,8 +252,7 @@ static void service_work_queue(struct simple_work_queue *queue) * This speeds up some performance tests; that "other work" might include other VDO * threads. */ - if (need_resched()) - cond_resched(); + cond_resched(); } run_finish_hook(queue); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 631a887b487c..d382a390d39a 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -191,7 +191,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true))) + verity_io_real_digest(v, io)))) return 0; return memcmp(verity_io_real_digest(v, io), want_digest, @@ -392,7 +392,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 81186bded1ce..66a00a8ccb39 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -19,7 +19,6 @@ #include "dm-audit.h" #include <linux/module.h> #include <linux/reboot.h> -#include <linux/scatterlist.h> #include <linux/string.h> #include <linux/jump_label.h> #include <linux/security.h> @@ -61,9 +60,6 @@ module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644) static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); -/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? 
*/ -static DEFINE_STATIC_KEY_FALSE(ahash_enabled); - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -118,100 +114,21 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, - struct crypto_wait *wait) -{ - struct scatterlist sg; - - if (likely(!is_vmalloc_addr(data))) { - sg_init_one(&sg, data, len); - ahash_request_set_crypt(req, &sg, NULL, len); - return crypto_wait_req(crypto_ahash_update(req), wait); - } - - do { - int r; - size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data)); - - flush_kernel_vmap_range((void *)data, this_step); - sg_init_table(&sg, 1); - sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data)); - ahash_request_set_crypt(req, &sg, NULL, this_step); - r = crypto_wait_req(crypto_ahash_update(req), wait); - if (unlikely(r)) - return r; - data += this_step; - len -= this_step; - } while (len); - - return 0; -} - -/* - * Wrapper for crypto_ahash_init, which handles verity salting. - */ -static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req, - struct crypto_wait *wait, bool may_sleep) -{ - int r; - - ahash_request_set_tfm(req, v->ahash_tfm); - ahash_request_set_callback(req, - may_sleep ? CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, - crypto_req_done, (void *)wait); - crypto_init_wait(wait); - - r = crypto_wait_req(crypto_ahash_init(req), wait); - - if (unlikely(r < 0)) { - if (r != -ENOMEM) - DMERR("crypto_ahash_init failed: %d", r); - return r; - } - - if (likely(v->salt_size && (v->version >= 1))) - r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); - - return r; -} - -static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct crypto_wait *wait) -{ - int r; - - if (unlikely(v->salt_size && (!v->version))) { - r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); - - if (r < 0) { - DMERR("%s failed updating salt: %d", __func__, r); - goto out; - } - } - - ahash_request_set_crypt(req, NULL, digest, 0); - r = crypto_wait_req(crypto_ahash_final(req), wait); -out: - return r; -} - int verity_hash(struct dm_verity *v, struct dm_verity_io *io, - const u8 *data, size_t len, u8 *digest, bool may_sleep) + const u8 *data, size_t len, u8 *digest) { + struct shash_desc *desc = &io->hash_desc; int r; - if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) { - struct ahash_request *req = verity_io_hash_req(v, io); - struct crypto_wait wait; - - r = verity_ahash_init(v, req, &wait, may_sleep) ?: - verity_ahash_update(v, req, data, len, &wait) ?: - verity_ahash_final(v, req, digest, &wait); + desc->tfm = v->shash_tfm; + if (unlikely(v->initial_hashstate == NULL)) { + /* Version 0: salt at end */ + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, data, len) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: + crypto_shash_final(desc, digest); } else { - struct shash_desc *desc = verity_io_hash_req(v, io); - - desc->tfm = v->shash_tfm; + /* Version 1: salt at beginning */ r = crypto_shash_import(desc, v->initial_hashstate) ?: crypto_shash_finup(desc, data, len, digest); } @@ -362,7 +279,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io), !io->in_bh); + verity_io_real_digest(v, io)); if (unlikely(r < 
0)) goto release_ret_r; @@ -465,7 +382,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true); + verity_io_real_digest(v, io)); if (unlikely(r)) goto free_ret; @@ -581,7 +498,7 @@ static int verity_verify_io(struct dm_verity_io *io) } r = verity_hash(v, io, data, block_size, - verity_io_real_digest(v, io), !io->in_bh); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) { kunmap_local(data); return r; @@ -1092,12 +1009,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->zero_digest); verity_free_sig(v); - if (v->ahash_tfm) { - static_branch_dec(&ahash_enabled); - crypto_free_ahash(v->ahash_tfm); - } else { - crypto_free_shash(v->shash_tfm); - } + crypto_free_shash(v->shash_tfm); kfree(v->alg_name); @@ -1157,7 +1069,8 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL); + io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), + GFP_KERNEL); if (!io) return r; /* verity_dtr will free zero_digest */ @@ -1168,7 +1081,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) goto out; r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits, - v->zero_digest, true); + v->zero_digest); out: kfree(io); @@ -1324,9 +1237,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) { struct dm_target *ti = v->ti; - struct crypto_ahash *ahash; - struct crypto_shash *shash = NULL; - const char *driver_name; + struct crypto_shash *shash; v->alg_name = kstrdup(alg_name, GFP_KERNEL); if (!v->alg_name) { @@ -1334,50 +1245,14 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) return -ENOMEM; } - /* - * Allocate the hash transformation object that this dm-verity instance - * will use. The vast majority of dm-verity users use CPU-based - * hashing, so when possible use the shash API to minimize the crypto - * API overhead. If the ahash API resolves to a different driver - * (likely an off-CPU hardware offload), use ahash instead. Also use - * ahash if the obsolete dm-verity format with the appended salt is - * being used, so that quirk only needs to be handled in one place. - */ - ahash = crypto_alloc_ahash(alg_name, 0, - v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); - if (IS_ERR(ahash)) { + shash = crypto_alloc_shash(alg_name, 0, 0); + if (IS_ERR(shash)) { ti->error = "Cannot initialize hash function"; - return PTR_ERR(ahash); - } - driver_name = crypto_ahash_driver_name(ahash); - if (v->version >= 1 /* salt prepended, not appended? */) { - shash = crypto_alloc_shash(alg_name, 0, 0); - if (!IS_ERR(shash) && - strcmp(crypto_shash_driver_name(shash), driver_name) != 0) { - /* - * ahash gave a different driver than shash, so probably - * this is a case of real hardware offload. Use ahash. 
- */ - crypto_free_shash(shash); - shash = NULL; - } - } - if (!IS_ERR_OR_NULL(shash)) { - crypto_free_ahash(ahash); - ahash = NULL; - v->shash_tfm = shash; - v->digest_size = crypto_shash_digestsize(shash); - v->hash_reqsize = sizeof(struct shash_desc) + - crypto_shash_descsize(shash); - DMINFO("%s using shash \"%s\"", alg_name, driver_name); - } else { - v->ahash_tfm = ahash; - static_branch_inc(&ahash_enabled); - v->digest_size = crypto_ahash_digestsize(ahash); - v->hash_reqsize = sizeof(struct ahash_request) + - crypto_ahash_reqsize(ahash); - DMINFO("%s using ahash \"%s\"", alg_name, driver_name); + return PTR_ERR(shash); } + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); + DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; return -EINVAL; @@ -1402,7 +1277,7 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) return -EINVAL; } } - if (v->shash_tfm) { + if (v->version) { /* Version 1: salt at beginning */ SHASH_DESC_ON_STACK(desc, v->shash_tfm); int r; @@ -1681,7 +1556,8 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize; + ti->per_io_data_size = sizeof(struct dm_verity_io) + + crypto_shash_descsize(v->shash_tfm); r = verity_fec_ctr(v); if (r) @@ -1788,10 +1664,7 @@ static int verity_preresume(struct dm_target *ti) bdev = dm_disk(dm_table_get_md(ti->table))->part0; root_digest.digest = v->root_digest; root_digest.digest_len = v->digest_size; - if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) - root_digest.alg = crypto_ahash_alg_name(v->ahash_tfm); - else - root_digest.alg = crypto_shash_alg_name(v->shash_tfm); + root_digest.alg = crypto_shash_alg_name(v->shash_tfm); r = security_bdev_setintegrity(bdev, LSM_INT_DMVERITY_ROOTHASH, &root_digest, sizeof(root_digest)); @@ -1817,7 +1690,7 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 11, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 8cbb57862ae1..6d141abd965c 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -39,11 +39,10 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */ - struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */ + struct crypto_shash *shash_tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ - u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */ + u8 *initial_hashstate; /* salted initial state, if version >= 1 */ u8 *zero_digest; /* digest for a zero block */ #ifdef CONFIG_SECURITY u8 *root_digest_sig; /* signature of the root digest */ @@ -61,7 +60,6 @@ struct dm_verity { bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ unsigned int digest_size; /* digest size for the current hash algorithm */ - unsigned int hash_reqsize; /* the size of temporary space for crypto */ enum verity_mode mode; /* mode for handling verification errors */ enum verity_mode error_mode;/* mode for handling I/O errors 
*/ unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ @@ -100,19 +98,13 @@ struct dm_verity_io { u8 want_digest[HASH_MAX_DIGESTSIZE]; /* - * This struct is followed by a variable-sized hash request of size - * v->hash_reqsize, either a struct ahash_request or a struct shash_desc - * (depending on whether ahash_tfm or shash_tfm is being used). To - * access it, use verity_io_hash_req(). + * Temporary space for hashing. This is variable-length and must be at + * the end of the struct. struct shash_desc is just the fixed part; + * it's followed by a context of size crypto_shash_descsize(shash_tfm). */ + struct shash_desc hash_desc; }; -static inline void *verity_io_hash_req(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io + 1; -} - static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { @@ -126,7 +118,7 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, } extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, - const u8 *data, size_t len, u8 *digest, bool may_sleep); + const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3d31b82e0730..78e17dd4d01b 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -467,8 +467,6 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) bdev_offset_from_zone_start(disk->part0, clone->bi_iter.bi_sector); } - - return; } static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 5da3db06da10..9da329078ea4 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1062,7 +1062,7 @@ static int dmz_iterate_devices(struct dm_target *ti, struct dmz_target *dmz = ti->private; unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); sector_t capacity; - int i, r; + int i, r = 0; for (i = 0; i < dmz->nr_ddevs; i++) { capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2d8402778e5c..a44e8c2dccee 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1024,10 +1024,8 @@ static void dm_wq_requeue_work(struct work_struct *work) * * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. */ -static void dm_io_complete(struct dm_io *io) +static inline void dm_io_complete(struct dm_io *io) { - bool first_requeue; - /* * Only dm_io that has been split needs two stage requeue, otherwise * we may run into long bio clone chain during suspend and OOM could @@ -1036,12 +1034,7 @@ static void dm_io_complete(struct dm_io *io) * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they * also aren't handled via the first stage requeue. */ - if (dm_io_flagged(io, DM_IO_WAS_SPLIT)) - first_requeue = true; - else - first_requeue = false; - - __dm_io_complete(io, first_requeue); + __dm_io_complete(io, dm_io_flagged(io, DM_IO_WAS_SPLIT)); } /* diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index feab392ab2ee..476e73e502fe 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4610,7 +4610,7 @@ static int mvneta_stop(struct net_device *dev) /* Inform that we are stopping so we don't want to setup the * driver for new CPUs in the notifiers. 
The code of the * notifier for CPU online is protected by the same spinlock, - * so when we get the lock, the notifer work is done. + * so when we get the lock, the notifier work is done. */ spin_lock(&pp->lock); pp->is_stopped = true; diff --git a/drivers/net/wwan/iosm/iosm_ipc_trace.c b/drivers/net/wwan/iosm/iosm_ipc_trace.c index eeecfa3d10c5..9656254c1c6c 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_trace.c +++ b/drivers/net/wwan/iosm/iosm_ipc_trace.c @@ -51,8 +51,7 @@ static int ipc_trace_remove_buf_file_handler(struct dentry *dentry) } static int ipc_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding) + void *prev_subbuf) { if (relay_buf_full(buf)) { pr_err_ratelimited("Relay_buf full dropping traces"); diff --git a/drivers/net/wwan/t7xx/t7xx_port_trace.c b/drivers/net/wwan/t7xx/t7xx_port_trace.c index 4ed8b4e29bf1..f16d3b01302c 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_trace.c +++ b/drivers/net/wwan/t7xx/t7xx_port_trace.c @@ -33,7 +33,7 @@ static int t7xx_trace_remove_buf_file_handler(struct dentry *dentry) } static int t7xx_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { if (relay_buf_full(buf)) { pr_err_ratelimited("Relay_buf full dropping traces"); diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 573a41869c15..c5345bff9a55 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -3,12 +3,15 @@ * PCI Hotplug Driver for PowerPC PowerNV platform. * * Copyright Gavin Shan, IBM Corporation 2016. + * Copyright (C) 2025 Raptor Engineering, LLC + * Copyright (C) 2025 Raptor Computing Systems, LLC */ #include <linux/bitfield.h> #include <linux/libfdt.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/delay.h> #include <linux/pci_hotplug.h> #include <linux/of_fdt.h> @@ -36,8 +39,10 @@ static void pnv_php_register(struct device_node *dn); static void pnv_php_unregister_one(struct device_node *dn); static void pnv_php_unregister(struct device_node *dn); +static void pnv_php_enable_irq(struct pnv_php_slot *php_slot); + static void pnv_php_disable_irq(struct pnv_php_slot *php_slot, - bool disable_device) + bool disable_device, bool disable_msi) { struct pci_dev *pdev = php_slot->pdev; u16 ctrl; @@ -53,19 +58,15 @@ static void pnv_php_disable_irq(struct pnv_php_slot *php_slot, php_slot->irq = 0; } - if (php_slot->wq) { - destroy_workqueue(php_slot->wq); - php_slot->wq = NULL; - } - - if (disable_device) { + if (disable_device || disable_msi) { if (pdev->msix_enabled) pci_disable_msix(pdev); else if (pdev->msi_enabled) pci_disable_msi(pdev); + } + if (disable_device) pci_disable_device(pdev); - } } static void pnv_php_free_slot(struct kref *kref) @@ -74,7 +75,8 @@ static void pnv_php_free_slot(struct kref *kref) struct pnv_php_slot, kref); WARN_ON(!list_empty(&php_slot->children)); - pnv_php_disable_irq(php_slot, false); + pnv_php_disable_irq(php_slot, false, false); + destroy_workqueue(php_slot->wq); kfree(php_slot->name); kfree(php_slot); } @@ -391,6 +393,20 @@ static int pnv_php_get_power_state(struct hotplug_slot *slot, u8 *state) return 0; } +static int pcie_check_link_active(struct pci_dev *pdev) +{ + u16 lnk_status; + int ret; + + ret = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status); + if (ret == PCIBIOS_DEVICE_NOT_FOUND || PCI_POSSIBLE_ERROR(lnk_status)) + return -ENODEV; + + ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA); + + return ret; +} + static int 
pnv_php_get_adapter_state(struct hotplug_slot *slot, u8 *state) { struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); @@ -403,6 +419,19 @@ static int pnv_php_get_adapter_state(struct hotplug_slot *slot, u8 *state) */ ret = pnv_pci_get_presence_state(php_slot->id, &presence); if (ret >= 0) { + if (pci_pcie_type(php_slot->pdev) == PCI_EXP_TYPE_DOWNSTREAM && + presence == OPAL_PCI_SLOT_EMPTY) { + /* + * Similar to pciehp_hpc, check whether the Link Active + * bit is set to account for broken downstream bridges + * that don't properly assert Presence Detect State, as + * was observed on the Microsemi Switchtec PM8533 PFX + * [11f8:8533]. + */ + if (pcie_check_link_active(php_slot->pdev) > 0) + presence = OPAL_PCI_SLOT_PRESENT; + } + *state = presence; ret = 0; } else { @@ -412,10 +441,23 @@ static int pnv_php_get_adapter_state(struct hotplug_slot *slot, u8 *state) return ret; } +static int pnv_php_get_raw_indicator_status(struct hotplug_slot *slot, u8 *state) +{ + struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); + struct pci_dev *bridge = php_slot->pdev; + u16 status; + + pcie_capability_read_word(bridge, PCI_EXP_SLTCTL, &status); + *state = (status & (PCI_EXP_SLTCTL_AIC | PCI_EXP_SLTCTL_PIC)) >> 6; + return 0; +} + + static int pnv_php_get_attention_state(struct hotplug_slot *slot, u8 *state) { struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); + pnv_php_get_raw_indicator_status(slot, &php_slot->attention_state); *state = php_slot->attention_state; return 0; } @@ -433,7 +475,7 @@ static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state) mask = PCI_EXP_SLTCTL_AIC; if (state) - new = PCI_EXP_SLTCTL_ATTN_IND_ON; + new = FIELD_PREP(PCI_EXP_SLTCTL_AIC, state); else new = PCI_EXP_SLTCTL_ATTN_IND_OFF; @@ -442,6 +484,61 @@ static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state) return 0; } +static int pnv_php_activate_slot(struct pnv_php_slot *php_slot, + struct hotplug_slot *slot) +{ + int ret, i; + + /* + * Issue initial slot activation command to firmware + * + * Firmware will power slot on, attempt to train the link, and + * discover any downstream devices. If this process fails, firmware + * will return an error code and an invalid device tree. Failure + * can be caused for multiple reasons, including a faulty + * downstream device, poor connection to the downstream device, or + * a previously latched PHB fence. On failure, issue fundamental + * reset up to three times before aborting. + */ + ret = pnv_php_set_slot_power_state(slot, OPAL_PCI_SLOT_POWER_ON); + if (ret) { + SLOT_WARN( + php_slot, + "PCI slot activation failed with error code %d, possible frozen PHB", + ret); + SLOT_WARN( + php_slot, + "Attempting complete PHB reset before retrying slot activation\n"); + for (i = 0; i < 3; i++) { + /* + * Slot activation failed, PHB may be fenced from a + * prior device failure. + * + * Use the OPAL fundamental reset call to both try a + * device reset and clear any potentially active PHB + * fence / freeze. 
+ */ + SLOT_WARN(php_slot, "Try %d...\n", i + 1); + pci_set_pcie_reset_state(php_slot->pdev, + pcie_warm_reset); + msleep(250); + pci_set_pcie_reset_state(php_slot->pdev, + pcie_deassert_reset); + + ret = pnv_php_set_slot_power_state( + slot, OPAL_PCI_SLOT_POWER_ON); + if (!ret) + break; + } + + if (i >= 3) + SLOT_WARN(php_slot, + "Failed to bring slot online, aborting!\n"); + } + + return ret; +} + static int pnv_php_enable(struct pnv_php_slot *php_slot, bool rescan) { struct hotplug_slot *slot = &php_slot->slot; @@ -504,7 +601,7 @@ static int pnv_php_enable(struct pnv_php_slot *php_slot, bool rescan) goto scan; /* Power is off, turn it on and then scan the slot */ - ret = pnv_php_set_slot_power_state(slot, OPAL_PCI_SLOT_POWER_ON); + ret = pnv_php_activate_slot(php_slot, slot); if (ret) return ret; @@ -561,8 +658,58 @@ static int pnv_php_reset_slot(struct hotplug_slot *slot, bool probe) static int pnv_php_enable_slot(struct hotplug_slot *slot) { struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); + u32 prop32; + int ret; + + ret = pnv_php_enable(php_slot, true); + if (ret) + return ret; + + /* (Re-)enable interrupt if the slot supports surprise hotplug */ + ret = of_property_read_u32(php_slot->dn, "ibm,slot-surprise-pluggable", + &prop32); + if (!ret && prop32) + pnv_php_enable_irq(php_slot); + + return 0; +} + +/* + * Disable any hotplug interrupts for all slots on the provided bus, as well as + * all downstream slots in preparation for a hot unplug. + */ +static int pnv_php_disable_all_irqs(struct pci_bus *bus) +{ + struct pci_bus *child_bus; + struct pci_slot *slot; + + /* First go down child buses */ + list_for_each_entry(child_bus, &bus->children, node) + pnv_php_disable_all_irqs(child_bus); + + /* Disable IRQs for all pnv_php slots on this bus */ + list_for_each_entry(slot, &bus->slots, list) { + struct pnv_php_slot *php_slot = to_pnv_php_slot(slot->hotplug); - return pnv_php_enable(php_slot, true); + pnv_php_disable_irq(php_slot, false, true); + } + + return 0; +} + +/* + * Disable any hotplug interrupts for all downstream slots on the provided + * bus in preparation for a hot unplug. + */ +static int pnv_php_disable_all_downstream_irqs(struct pci_bus *bus) +{ + struct pci_bus *child_bus; + + /* Go down child buses, recursively deactivating their IRQs */ + list_for_each_entry(child_bus, &bus->children, node) + pnv_php_disable_all_irqs(child_bus); + + return 0; } static int pnv_php_disable_slot(struct hotplug_slot *slot) @@ -579,6 +726,13 @@ static int pnv_php_disable_slot(struct hotplug_slot *slot) php_slot->state != PNV_PHP_STATE_REGISTERED) return 0; + /* + * Free all IRQ resources from all child slots before remove. + * Note that we do not disable the root slot IRQ here as that + * would also deactivate the slot hot (re)plug interrupt! 
+ */ + pnv_php_disable_all_downstream_irqs(php_slot->bus); + /* Remove all devices behind the slot */ pci_lock_rescan_remove(); pci_hp_remove_devices(php_slot->bus); @@ -647,6 +801,15 @@ static struct pnv_php_slot *pnv_php_alloc_slot(struct device_node *dn) return NULL; } + /* Allocate workqueue for this slot's interrupt handling */ + php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name); + if (!php_slot->wq) { + SLOT_WARN(php_slot, "Cannot alloc workqueue\n"); + kfree(php_slot->name); + kfree(php_slot); + return NULL; + } + if (dn->child && PCI_DN(dn->child)) php_slot->slot_no = PCI_SLOT(PCI_DN(dn->child)->devfn); else @@ -745,16 +908,63 @@ static int pnv_php_enable_msix(struct pnv_php_slot *php_slot) return entry.vector; } +static void +pnv_php_detect_clear_suprise_removal_freeze(struct pnv_php_slot *php_slot) +{ + struct pci_dev *pdev = php_slot->pdev; + struct eeh_dev *edev; + struct eeh_pe *pe; + int i, rc; + + /* + * When a device is surprise removed from a downstream bridge slot, + * the upstream bridge port can still end up frozen due to related EEH + * events, which will in turn block the MSI interrupts for slot hotplug + * detection. + * + * Detect and thaw any frozen upstream PE after slot deactivation. + */ + edev = pci_dev_to_eeh_dev(pdev); + pe = edev ? edev->pe : NULL; + rc = eeh_pe_get_state(pe); + if ((rc == -ENODEV) || (rc == -ENOENT)) { + SLOT_WARN( + php_slot, + "Upstream bridge PE state unknown, hotplug detect may fail\n"); + } else { + if (pe->state & EEH_PE_ISOLATED) { + SLOT_WARN( + php_slot, + "Upstream bridge PE %02x frozen, thawing...\n", + pe->addr); + for (i = 0; i < 3; i++) + if (!eeh_unfreeze_pe(pe)) + break; + if (i >= 3) + SLOT_WARN( + php_slot, + "Unable to thaw PE %02x, hotplug detect will fail!\n", + pe->addr); + else + SLOT_WARN(php_slot, + "PE %02x thawed successfully\n", + pe->addr); + } + } +} + static void pnv_php_event_handler(struct work_struct *work) { struct pnv_php_event *event = container_of(work, struct pnv_php_event, work); struct pnv_php_slot *php_slot = event->php_slot; - if (event->added) + if (event->added) { pnv_php_enable_slot(&php_slot->slot); - else + } else { pnv_php_disable_slot(&php_slot->slot); + pnv_php_detect_clear_suprise_removal_freeze(php_slot); + } kfree(event); } @@ -843,14 +1053,6 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq) u16 sts, ctrl; int ret; - /* Allocate workqueue */ - php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name); - if (!php_slot->wq) { - SLOT_WARN(php_slot, "Cannot alloc workqueue\n"); - pnv_php_disable_irq(php_slot, true); - return; - } - /* Check PDC (Presence Detection Change) is broken or not */ ret = of_property_read_u32(php_slot->dn, "ibm,slot-broken-pdc", &broken_pdc); @@ -869,7 +1071,7 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq) ret = request_irq(irq, pnv_php_interrupt, IRQF_SHARED, php_slot->name, php_slot); if (ret) { - pnv_php_disable_irq(php_slot, true); + pnv_php_disable_irq(php_slot, true, true); SLOT_WARN(php_slot, "Error %d enabling IRQ %d\n", ret, irq); return; } diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 9aec922613ce..64f6e9756aff 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -483,15 +483,6 @@ config RTC_DRV_PCF8523 This driver can also be built as a module. If so, the module will be called rtc-pcf8523. 
-config RTC_DRV_PCF85063 - tristate "NXP PCF85063" - select REGMAP_I2C - help - If you say yes here you get support for the PCF85063 RTC chip - - This driver can also be built as a module. If so, the module - will be called rtc-pcf85063. - config RTC_DRV_PCF85363 tristate "NXP PCF85363" select REGMAP_I2C @@ -971,6 +962,18 @@ config RTC_DRV_PCF2127 This driver can also be built as a module. If so, the module will be called rtc-pcf2127. +config RTC_DRV_PCF85063 + tristate "NXP PCF85063" + depends on RTC_I2C_AND_SPI + select REGMAP_I2C if I2C + select REGMAP_SPI if SPI_MASTER + help + If you say yes here you get support for the PCF85063 and RV8063 + RTC chips. + + This driver can also be built as a module. If so, the module + will be called rtc-pcf85063. + config RTC_DRV_RV3029C2 tristate "Micro Crystal RV3029/3049" depends on RTC_I2C_AND_SPI diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 4619aa2ac469..789bddfea99d 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -15,7 +15,7 @@ rtc-core-$(CONFIG_RTC_INTF_DEV) += dev.o rtc-core-$(CONFIG_RTC_INTF_PROC) += proc.o rtc-core-$(CONFIG_RTC_INTF_SYSFS) += sysfs.o -obj-$(CONFIG_RTC_LIB_KUNIT_TEST) += lib_test.o +obj-$(CONFIG_RTC_LIB_KUNIT_TEST) += test_rtc_lib.o # Keep the list ordered. diff --git a/drivers/rtc/lib.c b/drivers/rtc/lib.c index 13b5b1f20465..f7051592a6e3 100644 --- a/drivers/rtc/lib.c +++ b/drivers/rtc/lib.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(rtc_year_days); */ void rtc_time64_to_tm(time64_t time, struct rtc_time *tm) { - int days, secs; + int secs; u64 u64tmp; u32 u32tmp, udays, century, day_of_century, year_of_century, year, @@ -59,28 +59,26 @@ void rtc_time64_to_tm(time64_t time, struct rtc_time *tm) bool is_Jan_or_Feb, is_leap_year; /* - * Get days and seconds while preserving the sign to - * handle negative time values (dates before 1970-01-01) + * The time represented by `time` is given in seconds since 1970-01-01 + * (UTC). As the division done below might misbehave for negative + * values, we convert it to seconds since 0000-03-01 and then assume it + * will be non-negative. + * Below we do 4 * udays + 3 which should fit into a 32 bit unsigned + * variable. So the latest date this algorithm works for is 1073741823 + * days after 0000-03-01 which is in the year 2939805. */ - days = div_s64_rem(time, 86400, &secs); + time += (u64)719468 * 86400; + + udays = div_s64_rem(time, 86400, &secs); /* - * We need 0 <= secs < 86400 which isn't given for negative - * values of time. Fixup accordingly. + * day of the week, 0000-03-01 was a Wednesday (in the proleptic + * Gregorian calendar) */ - if (secs < 0) { - days -= 1; - secs += 86400; - } - - /* day of the week, 1970-01-01 was a Thursday */ - tm->tm_wday = (days + 4) % 7; - /* Ensure tm_wday is always positive */ - if (tm->tm_wday < 0) - tm->tm_wday += 7; + tm->tm_wday = (udays + 3) % 7; /* - * The following algorithm is, basically, Proposition 6.3 of Neri + * The following algorithm is, basically, Figure 12 of Neri * and Schneider [1]. In a few words: it works on the computational * (fictitious) calendar where the year starts in March, month = 2 * (*), and finishes in February, month = 13. This calendar is @@ -100,15 +98,15 @@ void rtc_time64_to_tm(time64_t time, struct rtc_time *tm) * (using just arithmetics) it's easy to convert it to the * corresponding date in the Gregorian calendar. * - * [1] "Euclidean Affine Functions and Applications to Calendar - * Algorithms". https://arxiv.org/abs/2102.06959 + * [1] Neri C, Schneider L. 
Euclidean affine functions and their + * application to calendar algorithms. Softw Pract Exper. + * 2023;53(4):937-970. doi: 10.1002/spe.3172 + * https://doi.org/10.1002/spe.3172 * * (*) The numbering of months follows rtc_time more closely and * thus, is slightly different from [1]. */ - udays = days + 719468; - u32tmp = 4 * udays + 3; century = u32tmp / 146097; day_of_century = u32tmp % 146097 / 4; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 5efbe69bf5ca..7205c59ff729 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -279,6 +279,13 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t) if (tmp & DS1340_BIT_OSF) return -EINVAL; break; + case ds_1341: + ret = regmap_read(ds1307->regmap, DS1337_REG_STATUS, &tmp); + if (ret) + return ret; + if (tmp & DS1337_BIT_OSF) + return -EINVAL; + break; case ds_1388: ret = regmap_read(ds1307->regmap, DS1388_REG_FLAG, &tmp); if (ret) @@ -377,6 +384,10 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) regmap_update_bits(ds1307->regmap, DS1340_REG_FLAG, DS1340_BIT_OSF, 0); break; + case ds_1341: + regmap_update_bits(ds1307->regmap, DS1337_REG_STATUS, + DS1337_BIT_OSF, 0); + break; case ds_1388: regmap_update_bits(ds1307->regmap, DS1388_REG_FLAG, DS1388_BIT_OSF, 0); @@ -1456,16 +1467,21 @@ static unsigned long ds3231_clk_sqw_recalc_rate(struct clk_hw *hw, return ds3231_clk_sqw_rates[rate_sel]; } -static long ds3231_clk_sqw_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int ds3231_clk_sqw_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i; for (i = ARRAY_SIZE(ds3231_clk_sqw_rates) - 1; i >= 0; i--) { - if (ds3231_clk_sqw_rates[i] <= rate) - return ds3231_clk_sqw_rates[i]; + if (ds3231_clk_sqw_rates[i] <= req->rate) { + req->rate = ds3231_clk_sqw_rates[i]; + + return 0; + } } + req->rate = ds3231_clk_sqw_rates[ARRAY_SIZE(ds3231_clk_sqw_rates) - 1]; + return 0; } @@ -1525,7 +1541,7 @@ static const struct clk_ops ds3231_clk_sqw_ops = { .unprepare = ds3231_clk_sqw_unprepare, .is_prepared = ds3231_clk_sqw_is_prepared, .recalc_rate = ds3231_clk_sqw_recalc_rate, - .round_rate = ds3231_clk_sqw_round_rate, + .determine_rate = ds3231_clk_sqw_determine_rate, .set_rate = ds3231_clk_sqw_set_rate, }; @@ -1813,10 +1829,8 @@ static int ds1307_probe(struct i2c_client *client) regmap_write(ds1307->regmap, DS1337_REG_CONTROL, regs[0]); - /* oscillator fault? clear flag, and warn */ + /* oscillator fault? warn */ if (regs[1] & DS1337_BIT_OSF) { - regmap_write(ds1307->regmap, DS1337_REG_STATUS, - regs[1] & ~DS1337_BIT_OSF); dev_warn(ds1307->dev, "SET TIME!\n"); } break; diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index 38e25f63597a..97423f1d0361 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -3,7 +3,7 @@ * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time * chips. * - * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>. + * Copyright (C) 2011-2014 Joshua Kinard <linux@kumba.dev>. * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>. 
* * References: @@ -1436,7 +1436,7 @@ EXPORT_SYMBOL_GPL(ds1685_rtc_poweroff); /* ----------------------------------------------------------------------- */ -MODULE_AUTHOR("Joshua Kinard <kumba@gentoo.org>"); +MODULE_AUTHOR("Joshua Kinard <linux@kumba.dev>"); MODULE_AUTHOR("Matthias Fuchs <matthias.fuchs@esd-electronics.com>"); MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-hym8563.c b/drivers/rtc/rtc-hym8563.c index 63f11ea3589d..7a170c0f9710 100644 --- a/drivers/rtc/rtc-hym8563.c +++ b/drivers/rtc/rtc-hym8563.c @@ -285,14 +285,19 @@ static unsigned long hym8563_clkout_recalc_rate(struct clk_hw *hw, return clkout_rates[ret]; } -static long hym8563_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int hym8563_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i; for (i = 0; i < ARRAY_SIZE(clkout_rates); i++) - if (clkout_rates[i] <= rate) - return clkout_rates[i]; + if (clkout_rates[i] <= req->rate) { + req->rate = clkout_rates[i]; + + return 0; + } + + req->rate = clkout_rates[0]; return 0; } @@ -363,7 +368,7 @@ static const struct clk_ops hym8563_clkout_ops = { .unprepare = hym8563_clkout_unprepare, .is_prepared = hym8563_clkout_is_prepared, .recalc_rate = hym8563_clkout_recalc_rate, - .round_rate = hym8563_clkout_round_rate, + .determine_rate = hym8563_clkout_determine_rate, .set_rate = hym8563_clkout_set_rate, }; diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index c568639d2151..740cab013f59 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -72,7 +72,7 @@ static const struct i2c_device_id m41t80_id[] = { { "m41t62", M41T80_FEATURE_SQ | M41T80_FEATURE_SQ_ALT }, - { "m41t65", M41T80_FEATURE_HT | M41T80_FEATURE_WD }, + { "m41t65", M41T80_FEATURE_WD }, { "m41t80", M41T80_FEATURE_SQ }, { "m41t81", M41T80_FEATURE_HT | M41T80_FEATURE_SQ}, { "m41t81s", M41T80_FEATURE_HT | M41T80_FEATURE_BL | M41T80_FEATURE_SQ }, @@ -93,7 +93,7 @@ static const __maybe_unused struct of_device_id m41t80_of_match[] = { }, { .compatible = "st,m41t65", - .data = (void *)(M41T80_FEATURE_HT | M41T80_FEATURE_WD) + .data = (void *)(M41T80_FEATURE_WD) }, { .compatible = "st,m41t80", @@ -484,16 +484,17 @@ static unsigned long m41t80_sqw_recalc_rate(struct clk_hw *hw, return sqw_to_m41t80_data(hw)->freq; } -static long m41t80_sqw_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int m41t80_sqw_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { - if (rate >= M41T80_SQW_MAX_FREQ) - return M41T80_SQW_MAX_FREQ; - if (rate >= M41T80_SQW_MAX_FREQ / 4) - return M41T80_SQW_MAX_FREQ / 4; - if (!rate) - return 0; - return 1 << ilog2(rate); + if (req->rate >= M41T80_SQW_MAX_FREQ) + req->rate = M41T80_SQW_MAX_FREQ; + else if (req->rate >= M41T80_SQW_MAX_FREQ / 4) + req->rate = M41T80_SQW_MAX_FREQ / 4; + else if (req->rate) + req->rate = 1 << ilog2(req->rate); + + return 0; } static int m41t80_sqw_set_rate(struct clk_hw *hw, unsigned long rate, @@ -564,7 +565,7 @@ static const struct clk_ops m41t80_sqw_ops = { .unprepare = m41t80_sqw_unprepare, .is_prepared = m41t80_sqw_is_prepared, .recalc_rate = m41t80_sqw_recalc_rate, - .round_rate = m41t80_sqw_round_rate, + .determine_rate = m41t80_sqw_determine_rate, .set_rate = m41t80_sqw_set_rate, }; diff --git a/drivers/rtc/rtc-max31335.c b/drivers/rtc/rtc-max31335.c index a7bb37aaab9e..dfb5bad3a369 100644 --- a/drivers/rtc/rtc-max31335.c +++ b/drivers/rtc/rtc-max31335.c 
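Nearly every clock-provider change in this series is the same mechanical conversion: the old .round_rate hook returned a rate (or a negative error), while .determine_rate writes the chosen rate back into the request and returns a status, and can additionally steer the parent rate for composite clocks. For the rate-table drivers (hym8563 above, and nct3018y, pcf85063, pcf8563, rv3028 below) the result follows this sketch, shown here with a hypothetical descending table::

  #include <linux/clk-provider.h>

  /* Hypothetical clkout table, highest rate first. */
  static const unsigned long example_rates[] = { 32768, 1024, 32, 1 };

  static int example_clkout_determine_rate(struct clk_hw *hw,
                                           struct clk_rate_request *req)
  {
          int i;

          /* Pick the highest supported rate not above the request. */
          for (i = 0; i < ARRAY_SIZE(example_rates); i++)
                  if (example_rates[i] <= req->rate) {
                          req->rate = example_rates[i];
                          return 0;
                  }

          /* A request below the smallest rate falls back to the first
           * table entry, mirroring the drivers in this series. */
          req->rate = example_rates[0];
          return 0;
  }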
@@ -497,15 +497,17 @@ static unsigned long max31335_clkout_recalc_rate(struct clk_hw *hw, return max31335_clkout_freq[reg & freq_mask]; } -static long max31335_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int max31335_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int index; - index = find_closest(rate, max31335_clkout_freq, + index = find_closest(req->rate, max31335_clkout_freq, ARRAY_SIZE(max31335_clkout_freq)); - return max31335_clkout_freq[index]; + req->rate = max31335_clkout_freq[index]; + + return 0; } static int max31335_clkout_set_rate(struct clk_hw *hw, unsigned long rate, @@ -554,7 +556,7 @@ static int max31335_clkout_is_enabled(struct clk_hw *hw) static const struct clk_ops max31335_clkout_ops = { .recalc_rate = max31335_clkout_recalc_rate, - .round_rate = max31335_clkout_round_rate, + .determine_rate = max31335_clkout_determine_rate, .set_rate = max31335_clkout_set_rate, .enable = max31335_clkout_enable, .disable = max31335_clkout_disable, diff --git a/drivers/rtc/rtc-nct3018y.c b/drivers/rtc/rtc-nct3018y.c index 76c5f464b2da..cd4b1db902e9 100644 --- a/drivers/rtc/rtc-nct3018y.c +++ b/drivers/rtc/rtc-nct3018y.c @@ -367,14 +367,19 @@ static unsigned long nct3018y_clkout_recalc_rate(struct clk_hw *hw, return clkout_rates[flags]; } -static long nct3018y_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int nct3018y_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i; for (i = 0; i < ARRAY_SIZE(clkout_rates); i++) - if (clkout_rates[i] <= rate) - return clkout_rates[i]; + if (clkout_rates[i] <= req->rate) { + req->rate = clkout_rates[i]; + + return 0; + } + + req->rate = clkout_rates[0]; return 0; } @@ -446,7 +451,7 @@ static const struct clk_ops nct3018y_clkout_ops = { .unprepare = nct3018y_clkout_unprepare, .is_prepared = nct3018y_clkout_is_prepared, .recalc_rate = nct3018y_clkout_recalc_rate, - .round_rate = nct3018y_clkout_round_rate, + .determine_rate = nct3018y_clkout_determine_rate, .set_rate = nct3018y_clkout_set_rate, }; diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index 4fa5c4ecdd5a..f643e0bd7351 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -17,6 +17,7 @@ #include <linux/of.h> #include <linux/pm_wakeirq.h> #include <linux/regmap.h> +#include <linux/spi/spi.h> /* * Information for this driver was pulled from the following datasheets. @@ -29,6 +30,9 @@ * * https://www.microcrystal.com/fileadmin/Media/Products/RTC/App.Manual/RV-8263-C7_App-Manual.pdf * RV8263 -- Rev. 1.0 — January 2019 + * + * https://www.microcrystal.com/fileadmin/Media/Products/RTC/App.Manual/RV-8063-C7_App-Manual.pdf + * RV8063 -- Rev. 
1.1 - October 2018 */ #define PCF85063_REG_CTRL1 0x00 /* status */ @@ -401,14 +405,19 @@ static unsigned long pcf85063_clkout_recalc_rate(struct clk_hw *hw, return clkout_rates[buf]; } -static long pcf85063_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int pcf85063_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i; for (i = 0; i < ARRAY_SIZE(clkout_rates); i++) - if (clkout_rates[i] <= rate) - return clkout_rates[i]; + if (clkout_rates[i] <= req->rate) { + req->rate = clkout_rates[i]; + + return 0; + } + + req->rate = clkout_rates[0]; return 0; } @@ -482,7 +491,7 @@ static const struct clk_ops pcf85063_clkout_ops = { .unprepare = pcf85063_clkout_unprepare, .is_prepared = pcf85063_clkout_is_prepared, .recalc_rate = pcf85063_clkout_recalc_rate, - .round_rate = pcf85063_clkout_round_rate, + .determine_rate = pcf85063_clkout_determine_rate, .set_rate = pcf85063_clkout_set_rate, }; @@ -524,47 +533,12 @@ static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063) } #endif -static const struct pcf85063_config config_pcf85063 = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x0a, - }, -}; - -static const struct pcf85063_config config_pcf85063tp = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x0a, - }, -}; - -static const struct pcf85063_config config_pcf85063a = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x11, - }, - .has_alarms = 1, -}; - -static const struct pcf85063_config config_rv8263 = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x11, - }, - .has_alarms = 1, - .force_cap_7000 = 1, -}; - -static int pcf85063_probe(struct i2c_client *client) +static int pcf85063_probe(struct device *dev, struct regmap *regmap, int irq, + const struct pcf85063_config *config) { struct pcf85063 *pcf85063; unsigned int tmp; int err; - const struct pcf85063_config *config; struct nvmem_config nvmem_cfg = { .name = "pcf85063_nvram", .reg_read = pcf85063_nvmem_read, @@ -573,28 +547,22 @@ static int pcf85063_probe(struct i2c_client *client) .size = 1, }; - dev_dbg(&client->dev, "%s\n", __func__); + dev_dbg(dev, "%s\n", __func__); - pcf85063 = devm_kzalloc(&client->dev, sizeof(struct pcf85063), + pcf85063 = devm_kzalloc(dev, sizeof(struct pcf85063), GFP_KERNEL); if (!pcf85063) return -ENOMEM; - config = i2c_get_match_data(client); - if (!config) - return -ENODEV; - - pcf85063->regmap = devm_regmap_init_i2c(client, &config->regmap); - if (IS_ERR(pcf85063->regmap)) - return PTR_ERR(pcf85063->regmap); + pcf85063->regmap = regmap; - i2c_set_clientdata(client, pcf85063); + dev_set_drvdata(dev, pcf85063); err = regmap_read(pcf85063->regmap, PCF85063_REG_SC, &tmp); if (err) - return dev_err_probe(&client->dev, err, "RTC chip is not present\n"); + return dev_err_probe(dev, err, "RTC chip is not present\n"); - pcf85063->rtc = devm_rtc_allocate_device(&client->dev); + pcf85063->rtc = devm_rtc_allocate_device(dev); if (IS_ERR(pcf85063->rtc)) return PTR_ERR(pcf85063->rtc); @@ -605,19 +573,17 @@ static int pcf85063_probe(struct i2c_client *client) * of the registers after the automatic power-on reset... 
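The pcf85063 rework follows the usual recipe for teaching an I2C-only driver a second bus: everything that used to take an i2c_client now takes a (dev, regmap, irq, config) tuple, and each bus contributes only a thin wrapper that knows how to build its regmap. Reduced to its shape (names hypothetical)::

  #include <linux/i2c.h>
  #include <linux/regmap.h>

  struct example_config {
          struct regmap_config regmap;
          unsigned int has_alarms:1;
  };

  /* Bus-agnostic core: no i2c_client or spi_device in sight. */
  static int example_probe(struct device *dev, struct regmap *regmap,
                           int irq, const struct example_config *cfg)
  {
          /* allocate state, read a register to detect the chip,
           * register the RTC, optionally request 'irq' for alarms */
          return 0;
  }

  static int example_i2c_probe(struct i2c_client *client)
  {
          const struct example_config *cfg = i2c_get_match_data(client);
          struct regmap *regmap;

          if (!cfg)
                  return -ENODEV;

          regmap = devm_regmap_init_i2c(client, &cfg->regmap);
          if (IS_ERR(regmap))
                  return PTR_ERR(regmap);

          return example_probe(&client->dev, regmap, client->irq, cfg);
  }

The SPI wrapper below is symmetric, differing only in devm_regmap_init_spi() and the read/write flag masks the chip's SPI protocol requires.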
*/ if (tmp & PCF85063_REG_SC_OS) { - dev_warn(&client->dev, - "POR issue detected, sending a SW reset\n"); + dev_warn(dev, "POR issue detected, sending a SW reset\n"); err = regmap_write(pcf85063->regmap, PCF85063_REG_CTRL1, PCF85063_REG_CTRL1_SWR); if (err < 0) - dev_warn(&client->dev, - "SW reset failed, trying to continue\n"); + dev_warn(dev, "SW reset failed, trying to continue\n"); } - err = pcf85063_load_capacitance(pcf85063, client->dev.of_node, + err = pcf85063_load_capacitance(pcf85063, dev->of_node, config->force_cap_7000 ? 7000 : 0); if (err < 0) - dev_warn(&client->dev, "failed to set xtal load capacitance: %d", + dev_warn(dev, "failed to set xtal load capacitance: %d", err); pcf85063->rtc->ops = &pcf85063_rtc_ops; @@ -627,13 +593,13 @@ static int pcf85063_probe(struct i2c_client *client) clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, pcf85063->rtc->features); clear_bit(RTC_FEATURE_ALARM, pcf85063->rtc->features); - if (config->has_alarms && client->irq > 0) { + if (config->has_alarms && irq > 0) { unsigned long irqflags = IRQF_TRIGGER_LOW; - if (dev_fwnode(&client->dev)) + if (dev_fwnode(dev)) irqflags = 0; - err = devm_request_threaded_irq(&client->dev, client->irq, + err = devm_request_threaded_irq(dev, irq, NULL, pcf85063_rtc_handle_irq, irqflags | IRQF_ONESHOT, "pcf85063", pcf85063); @@ -642,8 +608,8 @@ static int pcf85063_probe(struct i2c_client *client) "unable to request IRQ, alarms disabled\n"); } else { set_bit(RTC_FEATURE_ALARM, pcf85063->rtc->features); - device_init_wakeup(&client->dev, true); - err = dev_pm_set_wake_irq(&client->dev, client->irq); + device_init_wakeup(dev, true); + err = dev_pm_set_wake_irq(dev, irq); if (err) dev_err(&pcf85063->rtc->dev, "failed to enable irq wake\n"); @@ -661,6 +627,43 @@ static int pcf85063_probe(struct i2c_client *client) return devm_rtc_register_device(pcf85063->rtc); } +#if IS_ENABLED(CONFIG_I2C) + +static const struct pcf85063_config config_pcf85063 = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x0a, + }, +}; + +static const struct pcf85063_config config_pcf85063tp = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x0a, + }, +}; + +static const struct pcf85063_config config_pcf85063a = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x11, + }, + .has_alarms = 1, +}; + +static const struct pcf85063_config config_rv8263 = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x11, + }, + .has_alarms = 1, + .force_cap_7000 = 1, +}; + static const struct i2c_device_id pcf85063_ids[] = { { "pca85073a", .driver_data = (kernel_ulong_t)&config_pcf85063a }, { "pcf85063", .driver_data = (kernel_ulong_t)&config_pcf85063 }, @@ -683,16 +686,146 @@ static const struct of_device_id pcf85063_of_match[] = { MODULE_DEVICE_TABLE(of, pcf85063_of_match); #endif +static int pcf85063_i2c_probe(struct i2c_client *client) +{ + const struct pcf85063_config *config; + struct regmap *regmap; + + config = i2c_get_match_data(client); + if (!config) + return -ENODEV; + + regmap = devm_regmap_init_i2c(client, &config->regmap); + if (IS_ERR(regmap)) + return PTR_ERR(regmap); + + return pcf85063_probe(&client->dev, regmap, client->irq, config); +} + static struct i2c_driver pcf85063_driver = { .driver = { .name = "rtc-pcf85063", .of_match_table = of_match_ptr(pcf85063_of_match), }, - .probe = pcf85063_probe, + .probe = pcf85063_i2c_probe, .id_table = pcf85063_ids, }; -module_i2c_driver(pcf85063_driver); +static int pcf85063_register_driver(void) +{ + return 
i2c_add_driver(&pcf85063_driver); +} + +static void pcf85063_unregister_driver(void) +{ + i2c_del_driver(&pcf85063_driver); +} + +#else + +static int pcf85063_register_driver(void) +{ + return 0; +} + +static void pcf85063_unregister_driver(void) +{ +} + +#endif /* IS_ENABLED(CONFIG_I2C) */ + +#if IS_ENABLED(CONFIG_SPI_MASTER) + +static const struct pcf85063_config config_rv8063 = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x11, + .read_flag_mask = BIT(7) | BIT(5), + .write_flag_mask = BIT(5), + }, + .has_alarms = 1, + .force_cap_7000 = 1, +}; + +static const struct spi_device_id rv8063_id[] = { + { "rv8063" }, + {} +}; +MODULE_DEVICE_TABLE(spi, rv8063_id); + +static const struct of_device_id rv8063_of_match[] = { + { .compatible = "microcrystal,rv8063" }, + {} +}; +MODULE_DEVICE_TABLE(of, rv8063_of_match); + +static int rv8063_probe(struct spi_device *spi) +{ + const struct pcf85063_config *config = &config_rv8063; + struct regmap *regmap; + + regmap = devm_regmap_init_spi(spi, &config->regmap); + if (IS_ERR(regmap)) + return PTR_ERR(regmap); + + return pcf85063_probe(&spi->dev, regmap, spi->irq, config); +} + +static struct spi_driver rv8063_driver = { + .driver = { + .name = "rv8063", + .of_match_table = rv8063_of_match, + }, + .probe = rv8063_probe, + .id_table = rv8063_id, +}; + +static int __init rv8063_register_driver(void) +{ + return spi_register_driver(&rv8063_driver); +} + +static void __exit rv8063_unregister_driver(void) +{ + spi_unregister_driver(&rv8063_driver); +} + +#else + +static int __init rv8063_register_driver(void) +{ + return 0; +} + +static void __exit rv8063_unregister_driver(void) +{ +} + +#endif /* IS_ENABLED(CONFIG_SPI_MASTER) */ + +static int __init pcf85063_init(void) +{ + int ret; + + ret = pcf85063_register_driver(); + if (ret) + return ret; + + ret = rv8063_register_driver(); + if (ret) + pcf85063_unregister_driver(); + + return ret; +} +module_init(pcf85063_init); + +static void __exit pcf85063_exit(void) +{ + rv8063_unregister_driver(); + pcf85063_unregister_driver(); +} +module_exit(pcf85063_exit); MODULE_AUTHOR("Søren Andersen <san@rosetechnology.dk>"); MODULE_DESCRIPTION("PCF85063 RTC driver"); diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index b2611697fa5e..4e61011fb7a9 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c @@ -330,14 +330,19 @@ static unsigned long pcf8563_clkout_recalc_rate(struct clk_hw *hw, return clkout_rates[buf]; } -static long pcf8563_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int pcf8563_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i; for (i = 0; i < ARRAY_SIZE(clkout_rates); i++) - if (clkout_rates[i] <= rate) - return clkout_rates[i]; + if (clkout_rates[i] <= req->rate) { + req->rate = clkout_rates[i]; + + return 0; + } + + req->rate = clkout_rates[0]; return 0; } @@ -413,7 +418,7 @@ static const struct clk_ops pcf8563_clkout_ops = { .unprepare = pcf8563_clkout_unprepare, .is_prepared = pcf8563_clkout_is_prepared, .recalc_rate = pcf8563_clkout_recalc_rate, - .round_rate = pcf8563_clkout_round_rate, + .determine_rate = pcf8563_clkout_determine_rate, .set_rate = pcf8563_clkout_set_rate, }; diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index 868d1b1eb0f4..c2a531f0e125 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -731,14 +731,19 @@ static unsigned long rv3028_clkout_recalc_rate(struct clk_hw *hw, return clkout_rates[clkout]; } -static long 
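When a single module registers drivers on two buses, a failure to register the second must unwind the first, and unload must unregister in reverse order -- that is all pcf85063_init()/pcf85063_exit() above do, with each helper compiled to a no-op stub when its bus is configured out. The generic shape (hypothetical names)::

  static int __init example_init(void)
  {
          int ret;

          ret = example_register_i2c();      /* stub returns 0 if !CONFIG_I2C */
          if (ret)
                  return ret;

          ret = example_register_spi();      /* stub if !CONFIG_SPI_MASTER */
          if (ret)
                  example_unregister_i2c();  /* unwind on failure */

          return ret;
  }
  module_init(example_init);

  static void __exit example_exit(void)
  {
          /* reverse order of registration */
          example_unregister_spi();
          example_unregister_i2c();
  }
  module_exit(example_exit);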
rv3028_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int rv3028_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i; for (i = 0; i < ARRAY_SIZE(clkout_rates); i++) - if (clkout_rates[i] <= rate) - return clkout_rates[i]; + if (clkout_rates[i] <= req->rate) { + req->rate = clkout_rates[i]; + + return 0; + } + + req->rate = clkout_rates[0]; return 0; } @@ -802,7 +807,7 @@ static const struct clk_ops rv3028_clkout_ops = { .unprepare = rv3028_clkout_unprepare, .is_prepared = rv3028_clkout_is_prepared, .recalc_rate = rv3028_clkout_recalc_rate, - .round_rate = rv3028_clkout_round_rate, + .determine_rate = rv3028_clkout_determine_rate, .set_rate = rv3028_clkout_set_rate, }; diff --git a/drivers/rtc/rtc-rv3032.c b/drivers/rtc/rtc-rv3032.c index 2c6a8918acba..b8376bd1d905 100644 --- a/drivers/rtc/rtc-rv3032.c +++ b/drivers/rtc/rtc-rv3032.c @@ -646,19 +646,24 @@ static unsigned long rv3032_clkout_recalc_rate(struct clk_hw *hw, return clkout_xtal_rates[FIELD_GET(RV3032_CLKOUT2_FD_MSK, clkout)]; } -static long rv3032_clkout_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate) +static int rv3032_clkout_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { int i, hfd; - if (rate < RV3032_HFD_STEP) + if (req->rate < RV3032_HFD_STEP) for (i = 0; i < ARRAY_SIZE(clkout_xtal_rates); i++) - if (clkout_xtal_rates[i] <= rate) - return clkout_xtal_rates[i]; + if (clkout_xtal_rates[i] <= req->rate) { + req->rate = clkout_xtal_rates[i]; - hfd = DIV_ROUND_CLOSEST(rate, RV3032_HFD_STEP); + return 0; + } + + hfd = DIV_ROUND_CLOSEST(req->rate, RV3032_HFD_STEP); - return RV3032_HFD_STEP * clamp(hfd, 0, 8192); + req->rate = RV3032_HFD_STEP * clamp(hfd, 0, 8192); + + return 0; } static int rv3032_clkout_set_rate(struct clk_hw *hw, unsigned long rate, @@ -738,7 +743,7 @@ static const struct clk_ops rv3032_clkout_ops = { .unprepare = rv3032_clkout_unprepare, .is_prepared = rv3032_clkout_is_prepared, .recalc_rate = rv3032_clkout_recalc_rate, - .round_rate = rv3032_clkout_round_rate, + .determine_rate = rv3032_clkout_determine_rate, .set_rate = rv3032_clkout_set_rate, }; diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 5dd575865adf..79b2a16f15ad 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -549,25 +549,25 @@ static void s3c6410_rtc_irq(struct s3c_rtc *info, int mask) writeb(mask, info->base + S3C2410_INTP); } -static struct s3c_rtc_data const s3c2410_rtc_data = { +static const struct s3c_rtc_data s3c2410_rtc_data = { .irq_handler = s3c24xx_rtc_irq, .enable = s3c24xx_rtc_enable, .disable = s3c24xx_rtc_disable, }; -static struct s3c_rtc_data const s3c2416_rtc_data = { +static const struct s3c_rtc_data s3c2416_rtc_data = { .irq_handler = s3c24xx_rtc_irq, .enable = s3c24xx_rtc_enable, .disable = s3c24xx_rtc_disable, }; -static struct s3c_rtc_data const s3c2443_rtc_data = { +static const struct s3c_rtc_data s3c2443_rtc_data = { .irq_handler = s3c24xx_rtc_irq, .enable = s3c24xx_rtc_enable, .disable = s3c24xx_rtc_disable, }; -static struct s3c_rtc_data const s3c6410_rtc_data = { +static const struct s3c_rtc_data s3c6410_rtc_data = { .needs_src_clk = true, .irq_handler = s3c6410_rtc_irq, .enable = s3c24xx_rtc_enable, diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index f15ef3aa82a0..619800a00479 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -455,7 +455,7 @@ static void __exit sh_rtc_remove(struct platform_device *pdev) clk_disable(rtc->clk); } -static int 
__maybe_unused sh_rtc_suspend(struct device *dev) +static int sh_rtc_suspend(struct device *dev) { struct sh_rtc *rtc = dev_get_drvdata(dev); @@ -465,7 +465,7 @@ static int __maybe_unused sh_rtc_suspend(struct device *dev) return 0; } -static int __maybe_unused sh_rtc_resume(struct device *dev) +static int sh_rtc_resume(struct device *dev) { struct sh_rtc *rtc = dev_get_drvdata(dev); @@ -475,7 +475,7 @@ static int __maybe_unused sh_rtc_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(sh_rtc_pm_ops, sh_rtc_suspend, sh_rtc_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sh_rtc_pm_ops, sh_rtc_suspend, sh_rtc_resume); static const struct of_device_id sh_rtc_of_match[] = { { .compatible = "renesas,sh-rtc", }, @@ -492,7 +492,7 @@ MODULE_DEVICE_TABLE(of, sh_rtc_of_match); static struct platform_driver sh_rtc_platform_driver __refdata = { .driver = { .name = DRV_NAME, - .pm = &sh_rtc_pm_ops, + .pm = pm_sleep_ptr(&sh_rtc_pm_ops), .of_match_table = sh_rtc_of_match, }, .remove = __exit_p(sh_rtc_remove), diff --git a/drivers/rtc/sysfs.c b/drivers/rtc/sysfs.c index e3062c4d3f2c..4ab05e105a76 100644 --- a/drivers/rtc/sysfs.c +++ b/drivers/rtc/sysfs.c @@ -24,8 +24,8 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%s %s\n", dev_driver_string(dev->parent), - dev_name(dev->parent)); + return sysfs_emit(buf, "%s %s\n", dev_driver_string(dev->parent), + dev_name(dev->parent)); } static DEVICE_ATTR_RO(name); @@ -39,7 +39,7 @@ date_show(struct device *dev, struct device_attribute *attr, char *buf) if (retval) return retval; - return sprintf(buf, "%ptRd\n", &tm); + return sysfs_emit(buf, "%ptRd\n", &tm); } static DEVICE_ATTR_RO(date); @@ -53,7 +53,7 @@ time_show(struct device *dev, struct device_attribute *attr, char *buf) if (retval) return retval; - return sprintf(buf, "%ptRt\n", &tm); + return sysfs_emit(buf, "%ptRt\n", &tm); } static DEVICE_ATTR_RO(time); @@ -64,21 +64,17 @@ since_epoch_show(struct device *dev, struct device_attribute *attr, char *buf) struct rtc_time tm; retval = rtc_read_time(to_rtc_device(dev), &tm); - if (retval == 0) { - time64_t time; - - time = rtc_tm_to_time64(&tm); - retval = sprintf(buf, "%lld\n", time); - } + if (retval) + return retval; - return retval; + return sysfs_emit(buf, "%lld\n", rtc_tm_to_time64(&tm)); } static DEVICE_ATTR_RO(since_epoch); static ssize_t max_user_freq_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_rtc_device(dev)->max_user_freq); + return sysfs_emit(buf, "%d\n", to_rtc_device(dev)->max_user_freq); } static ssize_t @@ -118,9 +114,9 @@ hctosys_show(struct device *dev, struct device_attribute *attr, char *buf) if (rtc_hctosys_ret == 0 && strcmp(dev_name(&to_rtc_device(dev)->dev), CONFIG_RTC_HCTOSYS_DEVICE) == 0) - return sprintf(buf, "1\n"); + return sysfs_emit(buf, "1\n"); #endif - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static DEVICE_ATTR_RO(hctosys); @@ -128,7 +124,6 @@ static ssize_t wakealarm_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t retval; - time64_t alarm; struct rtc_wkalrm alm; /* Don't show disabled alarms. For uniformity, RTC alarms are @@ -140,12 +135,13 @@ wakealarm_show(struct device *dev, struct device_attribute *attr, char *buf) * alarms after they trigger, to ensure one-shot semantics. 
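The rtc sysfs conversion above is another tree-wide pattern: sysfs_emit() knows the destination is a full page handed out by sysfs and checks for misuse, where sprintf() simply trusts the caller; the rewrites also flatten the error handling into early returns. A representative show() callback in the new style, modeled on since_epoch_show() above::

  static ssize_t example_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
  {
          struct rtc_time tm;
          int retval;

          retval = rtc_read_time(to_rtc_device(dev), &tm);
          if (retval)
                  return retval;

          return sysfs_emit(buf, "%lld\n", rtc_tm_to_time64(&tm));
  }
  static DEVICE_ATTR_RO(example);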
*/ retval = rtc_read_alarm(to_rtc_device(dev), &alm); - if (retval == 0 && alm.enabled) { - alarm = rtc_tm_to_time64(&alm.time); - retval = sprintf(buf, "%lld\n", alarm); - } + if (retval) + return retval; - return retval; + if (alm.enabled) + return sysfs_emit(buf, "%lld\n", rtc_tm_to_time64(&alm.time)); + + return 0; } static ssize_t @@ -222,10 +218,10 @@ offset_show(struct device *dev, struct device_attribute *attr, char *buf) long offset; retval = rtc_read_offset(to_rtc_device(dev), &offset); - if (retval == 0) - retval = sprintf(buf, "%ld\n", offset); + if (retval) + return retval; - return retval; + return sysfs_emit(buf, "%ld\n", offset); } static ssize_t @@ -246,8 +242,8 @@ static DEVICE_ATTR_RW(offset); static ssize_t range_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "[%lld,%llu]\n", to_rtc_device(dev)->range_min, - to_rtc_device(dev)->range_max); + return sysfs_emit(buf, "[%lld,%llu]\n", to_rtc_device(dev)->range_min, + to_rtc_device(dev)->range_max); } static DEVICE_ATTR_RO(range); @@ -302,11 +298,7 @@ static struct attribute_group rtc_attr_group = { .is_visible = rtc_attr_is_visible, .attrs = rtc_attrs, }; - -static const struct attribute_group *rtc_attr_groups[] = { - &rtc_attr_group, - NULL -}; +__ATTRIBUTE_GROUPS(rtc_attr); const struct attribute_group **rtc_get_dev_attribute_groups(void) { @@ -318,17 +310,21 @@ int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps) size_t old_cnt = 0, add_cnt = 0, new_cnt; const struct attribute_group **groups, **old; - if (!grps) + if (grps) { + for (groups = grps; *groups; groups++) + add_cnt++; + /* No need to modify current groups if nothing new is provided */ + if (add_cnt == 0) + return 0; + } else { return -EINVAL; + } groups = rtc->dev.groups; if (groups) for (; *groups; groups++) old_cnt++; - for (groups = grps; *groups; groups++) - add_cnt++; - new_cnt = old_cnt + add_cnt + 1; groups = devm_kcalloc(&rtc->dev, new_cnt, sizeof(*groups), GFP_KERNEL); if (!groups) diff --git a/drivers/rtc/lib_test.c b/drivers/rtc/test_rtc_lib.c index 0eebad1fe2a0..0eebad1fe2a0 100644 --- a/drivers/rtc/lib_test.c +++ b/drivers/rtc/test_rtc_lib.c diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f149ec28aefd..db3831f7f2f5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -82,7 +82,7 @@ repeat: if (folio_test_uptodate(folio)) goto out; - fio.page = &folio->page; + fio.folio = folio; err = f2fs_submit_page_bio(&fio); if (err) { @@ -309,7 +309,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, continue; } - fio.page = &folio->page; + fio.folio = folio; err = f2fs_submit_page_bio(&fio); f2fs_folio_put(folio, err ? 
true : false); @@ -485,7 +485,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, folio_mark_uptodate(folio); if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); return true; } return false; @@ -1045,7 +1045,7 @@ void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index b3c1df93a163..5c1f47e45dab 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -23,20 +23,18 @@ static struct kmem_cache *cic_entry_slab; static struct kmem_cache *dic_entry_slab; -static void *page_array_alloc(struct inode *inode, int nr) +static void *page_array_alloc(struct f2fs_sb_info *sbi, int nr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) return f2fs_kmem_cache_alloc(sbi->page_array_slab, - GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); + GFP_F2FS_ZERO, false, sbi); return f2fs_kzalloc(sbi, size, GFP_NOFS); } -static void page_array_free(struct inode *inode, void *pages, int nr) +static void page_array_free(struct f2fs_sb_info *sbi, void *pages, int nr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (!pages) @@ -73,17 +71,15 @@ static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) return cc->cluster_idx << cc->log_cluster_size; } -bool f2fs_is_compressed_page(struct page *page) +bool f2fs_is_compressed_page(struct folio *folio) { - if (!PagePrivate(page)) - return false; - if (!page_private(page)) + if (!folio->private) return false; - if (page_private_nonpointer(page)) + if (folio_test_f2fs_nonpointer(folio)) return false; - f2fs_bug_on(F2FS_P_SB(page), - *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + f2fs_bug_on(F2FS_F_SB(folio), + *((u32 *)folio->private) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } @@ -149,13 +145,13 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) if (cc->rpages) return 0; - cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); + cc->rpages = page_array_alloc(F2FS_I_SB(cc->inode), cc->cluster_size); return cc->rpages ? 
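page_array_alloc()/page_array_free() now take the f2fs_sb_info directly instead of rederiving it from an inode -- which matters once teardown paths can run after the inode is gone (see f2fs_free_dic() further down). Their split between a sized slab and the general allocator is a common shape; simplified, with hypothetical names::

  struct example_sb {
          struct kmem_cache *array_slab;
          unsigned int array_slab_size;
  };

  /* Common small sizes come from a dedicated slab; larger requests
   * fall back to the general-purpose allocator. */
  static void *array_alloc(struct example_sb *sb, int nr)
  {
          unsigned int size = sizeof(struct page *) * nr;

          if (likely(size <= sb->array_slab_size))
                  return kmem_cache_zalloc(sb->array_slab, GFP_NOFS);

          return kzalloc(size, GFP_NOFS);
  }

  static void array_free(struct example_sb *sb, void *ptr, int nr)
  {
          unsigned int size = sizeof(struct page *) * nr;

          if (!ptr)
                  return;

          if (likely(size <= sb->array_slab_size))
                  kmem_cache_free(sb->array_slab, ptr);
          else
                  kfree(ptr);
  }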
0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { - page_array_free(cc->inode, cc->rpages, cc->cluster_size); + page_array_free(F2FS_I_SB(cc->inode), cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; @@ -216,13 +212,13 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic) ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, dic->rbuf, &dic->rlen); if (ret != LZO_E_OK) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lzo decompress failed, ret:%d", ret); return -EIO; } if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lzo invalid rlen:%zu, expected:%lu", dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -296,13 +292,13 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, dic->clen, dic->rlen); if (ret < 0) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lz4 decompress failed, ret:%d", ret); return -EIO; } if (ret != PAGE_SIZE << dic->log_cluster_size) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "lz4 invalid ret:%d, expected:%lu", ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; @@ -424,13 +420,13 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) workspace_size = zstd_dstream_workspace_bound(max_window_size); - workspace = f2fs_vmalloc(F2FS_I_SB(dic->inode), workspace_size); + workspace = f2fs_vmalloc(dic->sbi, workspace_size); if (!workspace) return -ENOMEM; stream = zstd_init_dstream(max_window_size, workspace, workspace_size); if (!stream) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s zstd_init_dstream failed", __func__); vfree(workspace); return -EIO; @@ -466,14 +462,14 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic) ret = zstd_decompress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s zstd_decompress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } if (dic->rlen != outbuf.pos) { - f2fs_err_ratelimited(F2FS_I_SB(dic->inode), + f2fs_err_ratelimited(dic->sbi, "%s ZSTD invalid rlen:%zu, expected:%lu", __func__, dic->rlen, PAGE_SIZE << dic->log_cluster_size); @@ -622,6 +618,7 @@ static void *f2fs_vmap(struct page **pages, unsigned int count) static int f2fs_compress_pages(struct compress_ctx *cc) { + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; @@ -642,7 +639,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); cc->valid_nr_cpages = cc->nr_cpages; - cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); + cc->cpages = page_array_alloc(sbi, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; @@ -716,7 +713,7 @@ out_free_cpages: if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) @@ -734,7 +731,7 @@ static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { - 
struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_sb_info *sbi = dic->sbi; struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; @@ -796,25 +793,27 @@ out_end_io: f2fs_decompress_end_io(dic, ret, in_task); } +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr); + /* * This is called when a page of a compressed cluster has been read from disk * (or failed to be read from disk). It checks whether this page was the last * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ -void f2fs_end_read_compressed_page(struct page *page, bool failed, +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); - struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct decompress_io_ctx *dic = folio->private; + struct f2fs_sb_info *sbi = dic->sbi; dec_page_count(sbi, F2FS_RD_DATA); if (failed) WRITE_ONCE(dic->failed, true); else if (blkaddr && in_task) - f2fs_cache_compressed_page(sbi, page, + f2fs_cache_compressed_page(sbi, folio, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) @@ -1340,7 +1339,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; atomic_set(&cic->pending_pages, cc->valid_nr_cpages); - cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + cic->rpages = page_array_alloc(sbi, cc->cluster_size); if (!cic->rpages) goto out_put_cic; @@ -1420,7 +1419,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, (*submitted)++; unlock_continue: inode_dec_dirty_pages(cc->inode); - unlock_page(fio.page); + folio_unlock(fio.folio); } if (fio.compr_blocks) @@ -1442,13 +1441,13 @@ unlock_continue: spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: - page_array_free(cc->inode, cic->rpages, cc->cluster_size); + page_array_free(sbi, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) { if (!cc->cpages[i]) @@ -1469,18 +1468,18 @@ out_free: f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + page_array_free(sbi, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; } -void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio) { + struct page *page = &folio->page; struct f2fs_sb_info *sbi = bio->bi_private; - struct compress_io_ctx *cic = - (struct compress_io_ctx *)page_private(page); - enum count_type type = WB_DATA_TYPE(page, - f2fs_is_compressed_page(page)); + struct compress_io_ctx *cic = folio->private; + enum count_type type = WB_DATA_TYPE(folio, + f2fs_is_compressed_page(folio)); int i; if (unlikely(bio->bi_status != BLK_STS_OK)) @@ -1499,7 +1498,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) end_page_writeback(cic->rpages[i]); } - page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + page_array_free(sbi, cic->rpages, cic->nr_rpages); kmem_cache_free(cic_entry_slab, cic); } @@ -1633,14 +1632,13 @@ static inline bool 
allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc) { - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; int i; - if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) return 0; - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + dic->tpages = page_array_alloc(dic->sbi, dic->cluster_size); if (!dic->tpages) return -ENOMEM; @@ -1670,10 +1668,9 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc) { - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; - if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) return; if (!bypass_destroy_callback && cops->destroy_decompress_ctx) @@ -1700,7 +1697,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + dic->rpages = page_array_alloc(sbi, cc->cluster_size); if (!dic->rpages) { kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); @@ -1708,6 +1705,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; + dic->sbi = sbi; + dic->compress_algorithm = F2FS_I(cc->inode)->i_compress_algorithm; atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; @@ -1721,7 +1720,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); + dic->cpages = page_array_alloc(sbi, dic->nr_cpages); if (!dic->cpages) { ret = -ENOMEM; goto out_free; @@ -1751,6 +1750,8 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback) { int i; + /* use sbi in dic to avoid UAF of dic->inode */ + struct f2fs_sb_info *sbi = dic->sbi; f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); @@ -1762,7 +1763,7 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, continue; f2fs_compress_free_page(dic->tpages[i]); } - page_array_free(dic->inode, dic->tpages, dic->cluster_size); + page_array_free(sbi, dic->tpages, dic->cluster_size); } if (dic->cpages) { @@ -1771,10 +1772,10 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic, continue; f2fs_compress_free_page(dic->cpages[i]); } - page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + page_array_free(sbi, dic->cpages, dic->nr_cpages); } - page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + page_array_free(sbi, dic->rpages, dic->nr_rpages); kmem_cache_free(dic_entry_slab, dic); } @@ -1793,8 +1794,7 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) f2fs_free_dic(dic, false); } else { INIT_WORK(&dic->free_work, f2fs_late_free_dic); - queue_work(F2FS_I_SB(dic->inode)->post_read_wq, - &dic->free_work); + queue_work(dic->sbi->post_read_wq, &dic->free_work); } } } @@ -1921,8 +1921,8 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1); } -void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, - nid_t ino, block_t blkaddr) +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr) { struct folio *cfolio; int ret; @@ -1953,9 +1953,9 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, return; } - set_page_private_data(&cfolio->page, ino); + folio_set_f2fs_data(cfolio, ino); - memcpy(folio_address(cfolio), page_address(page), PAGE_SIZE); + memcpy(folio_address(cfolio), folio_address(folio), PAGE_SIZE); folio_mark_uptodate(cfolio); f2fs_folio_put(cfolio, true); } @@ -2012,7 +2012,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - if (ino != get_page_private_data(&folio->page)) { + if (ino != folio_get_f2fs_data(folio)) { folio_unlock(folio); continue; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 711ad80b38d0..7961e0ddfca3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,14 +47,14 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -bool f2fs_is_cp_guaranteed(struct page *page) +bool f2fs_is_cp_guaranteed(const struct folio *folio) { - struct address_space *mapping = page_folio(page)->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode; struct f2fs_sb_info *sbi; - if (fscrypt_is_bounce_page(page)) - return page_private_gcing(fscrypt_pagecache_page(page)); + if (fscrypt_is_bounce_folio(folio)) + return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio)); inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -65,7 +65,7 @@ bool f2fs_is_cp_guaranteed(struct page *page) return true; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || - page_private_gcing(page)) + folio_test_f2fs_gcing(folio)) return true; return false; } @@ -142,9 +142,9 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(&folio->page)) { + if (f2fs_is_compressed_page(folio)) { if (ctx && !ctx->decompression_attempted) - f2fs_end_read_compressed_page(&folio->page, true, 0, + f2fs_end_read_compressed_page(folio, true, 0, in_task); f2fs_put_folio_dic(folio, in_task); continue; @@ -181,14 +181,13 @@ static void f2fs_verify_bio(struct work_struct *work) * as those were handled separately by f2fs_end_read_compressed_page(). 
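Several completion paths in this patch move from bio_for_each_segment_all() to bio_for_each_folio_all(), visiting each (possibly large) folio once instead of once per page. The iteration shape, sketched against the block-layer API::

  #include <linux/bio.h>

  static void example_end_io(struct bio *bio)
  {
          struct folio_iter fi;

          bio_for_each_folio_all(fi, bio) {
                  struct folio *folio = fi.folio;

                  /* per-folio completion: inspect folio->private,
                   * end writeback, and so on */
                  folio_end_writeback(folio);
          }
          bio_put(bio);
  }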
*/ if (may_have_compressed_pages) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) { + if (!f2fs_is_compressed_page(folio) && + !fsverity_verify_page(&folio->page)) { bio->bi_status = BLK_STS_IOERR; break; } @@ -233,16 +232,15 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bool in_task) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; bool all_compressed = true; block_t blkaddr = ctx->fs_blkaddr; - bio_for_each_segment_all(bv, ctx->bio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, ctx->bio) { + struct folio *folio = fi.folio; - if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, false, blkaddr, + if (f2fs_is_compressed_page(folio)) + f2fs_end_read_compressed_page(folio, false, blkaddr, in_task); else all_compressed = false; @@ -280,9 +278,9 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct f2fs_sb_info *sbi = F2FS_F_SB(bio_first_folio_all(bio)); struct bio_post_read_ctx *ctx; - bool intask = in_task(); + bool intask = in_task() && !irqs_disabled(); iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; @@ -339,13 +337,13 @@ static void f2fs_write_end_io(struct bio *bio) } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(&folio->page)) { - f2fs_compress_write_end_io(bio, &folio->page); + if (f2fs_is_compressed_page(folio)) { + f2fs_compress_write_end_io(bio, folio); continue; } #endif - type = WB_DATA_TYPE(&folio->page, false); + type = WB_DATA_TYPE(folio, false); if (unlikely(bio->bi_status != BLK_STS_OK)) { mapping_set_error(folio->mapping, -EIO); @@ -355,12 +353,12 @@ static void f2fs_write_end_io(struct bio *bio) } f2fs_bug_on(sbi, is_node_folio(folio) && - folio->index != nid_of_node(&folio->page)); + folio->index != nid_of_node(folio)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, folio); - clear_page_private_gcing(&folio->page); + folio_clear_f2fs_gcing(folio); folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -419,7 +417,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); - struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -447,7 +444,7 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_FUA; if (fio->type == DATA && - F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + F2FS_I(fio->folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) op_flags |= REQ_PRIO; return op_flags; @@ -546,14 +543,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) } static bool __has_merged_page(struct bio *bio, struct inode *inode, - struct page *page, nid_t ino) + struct folio *folio, nid_t ino) { struct folio_iter fi; if (!bio) return false; - if (!inode && !page && !ino) + if (!inode && !folio && !ino) return true; bio_for_each_folio_all(fi, bio) { @@ -564,7 +561,7 @@ static bool 
__has_merged_page(struct bio *bio, struct inode *inode, if (IS_ERR(target)) continue; } - if (f2fs_is_compressed_page(&target->page)) { + if (f2fs_is_compressed_page(target)) { target = f2fs_compress_control_folio(target); if (IS_ERR(target)) continue; @@ -572,9 +569,9 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (inode && inode == target->mapping->host) return true; - if (page && page == &target->page) + if (folio && folio == target) return true; - if (ino && ino == ino_of_node(&target->page)) + if (ino && ino == ino_of_node(target)) return true; } @@ -641,7 +638,7 @@ unlock_out: } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type, bool force) { enum temp_type temp; @@ -653,7 +650,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_read(&io->io_rwsem); - ret = __has_merged_page(io->bio, inode, page, ino); + ret = __has_merged_page(io->bio, inode, folio, ino); f2fs_up_read(&io->io_rwsem); } if (ret) @@ -671,10 +668,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, page, ino, type, false); + __submit_merged_write_cond(sbi, inode, folio, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -691,7 +688,7 @@ void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct folio *fio_folio = page_folio(fio->page); + struct folio *fio_folio = fio->folio; struct folio *data_folio = fio->encrypted_page ? page_folio(fio->encrypted_page) : fio_folio; @@ -713,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false)); + __read_io_type(data_folio) : WB_DATA_TYPE(fio->folio, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); @@ -779,7 +776,7 @@ static void del_bio_entry(struct bio_entry *be) static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { - struct folio *fio_folio = page_folio(fio->page); + struct folio *fio_folio = fio->folio; struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; @@ -848,7 +845,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - &folio->page, 0); + folio, 0); if (found) break; } @@ -865,7 +862,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - &folio->page, 0); + folio, 0); if (found) { target = be->bio; del_bio_entry(be); @@ -886,15 +883,15 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; - struct page *page = fio->encrypted_page ? - fio->encrypted_page : fio->page; - struct folio *folio = page_folio(fio->page); + struct folio *data_folio = fio->encrypted_page ? 
+ page_folio(fio->encrypted_page) : fio->folio; + struct folio *folio = fio->folio; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; - trace_f2fs_submit_folio_bio(page_folio(page), fio); + trace_f2fs_submit_folio_bio(data_folio, fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -905,16 +902,16 @@ alloc_new: f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, folio->index, fio, GFP_NOIO); - add_bio_entry(fio->sbi, bio, page, fio->temp); + add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp); } else { - if (add_ipu_page(fio, &bio, page)) + if (add_ipu_page(fio, &bio, &data_folio->page)) goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -949,7 +946,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; - struct page *bio_page; + struct folio *bio_folio; enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -980,44 +977,44 @@ next: verify_fio_blkaddr(fio); if (fio->encrypted_page) - bio_page = fio->encrypted_page; + bio_folio = page_folio(fio->encrypted_page); else if (fio->compressed_page) - bio_page = fio->compressed_page; + bio_folio = page_folio(fio->compressed_page); else - bio_page = fio->page; + bio_folio = fio->folio; /* set submitted = true as a return value */ fio->submitted = 1; - type = WB_DATA_TYPE(bio_page, fio->compressed_page); + type = WB_DATA_TYPE(bio_folio, fio->compressed_page); inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || !f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio), - page_folio(bio_page)->index, fio))) + bio_folio->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio), - page_folio(bio_page)->index, fio, GFP_NOIO); + bio_folio->index, fio, GFP_NOIO); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { + if (!bio_add_folio(io->bio, bio_folio, folio_size(bio_folio), 0)) { __submit_merged_bio(io); goto alloc_new; } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->folio, + folio_size(fio->folio)); io->last_block_in_bio = fio->new_blkaddr; - trace_f2fs_submit_folio_write(page_folio(fio->page), fio); + trace_f2fs_submit_folio_write(fio->folio, fio); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { @@ -1553,10 +1550,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) unsigned int start_pgofs; int bidx = 0; bool is_hole; + bool lfs_dio_write; if (!maxblocks) return 0; + lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && + map->m_may_create); + if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) goto out; @@ -1572,8 +1573,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) end = pgofs + maxblocks; next_dnode: - if (map->m_may_create) + if (map->m_may_create) { + if (f2fs_lfs_mode(sbi)) + 
f2fs_balance_fs(sbi, true); f2fs_map_lock(sbi, flag); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1589,7 +1593,7 @@ next_dnode: start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); @@ -1603,7 +1607,7 @@ next_block: /* use out-place-update for direct IO under LFS mode */ if (map->m_may_create && (is_hole || (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && - !f2fs_is_pinned_file(inode)))) { + !f2fs_is_pinned_file(inode) && map->m_last_pblk != blkaddr))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; @@ -1687,10 +1691,15 @@ next_block: if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; + + if (lfs_dio_write) + map->m_last_pblk = NULL_ADDR; } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { ofs++; map->m_len++; } else { + if (lfs_dio_write && !f2fs_is_pinned_file(inode)) + map->m_last_pblk = blkaddr; goto sync_out; } @@ -1715,14 +1724,6 @@ skip: dn.ofs_in_node = end_offset; } - if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && - map->m_may_create) { - /* the next block to be allocated may not be contiguous. */ - if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) == - CAP_BLKS_PER_SEC(sbi) - 1) - goto sync_out; - } - if (pgofs >= end) goto sync_out; else if (dn.ofs_in_node < end_offset) @@ -2303,7 +2304,7 @@ submit_and_realloc: } if (!bio) { - bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); if (IS_ERR(bio)) { @@ -2376,6 +2377,14 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned max_nr_pages = nr_pages; int ret = 0; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + index = rac ? 
readahead_index(rac) : folio->index; + max_nr_pages = round_up(index + nr_pages, cc.cluster_size) - + round_down(index, cc.cluster_size); + } +#endif + map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; @@ -2642,7 +2651,7 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) int f2fs_do_write_data_page(struct f2fs_io_info *fio) { - struct folio *folio = page_folio(fio->page); + struct folio *folio = fio->folio; struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; @@ -2652,7 +2661,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* Use COW inode to make dnode_of_data for atomic write */ atomic_commit = f2fs_is_atomic_file(inode) && - page_private_atomic(folio_page(folio, 0)); + folio_test_f2fs_atomic(folio); if (atomic_commit) set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); else @@ -2683,7 +2692,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { folio_clear_uptodate(folio); - clear_page_private_gcing(folio_page(folio, 0)); + folio_clear_f2fs_gcing(folio); goto out_writepage; } got_it: @@ -2753,7 +2762,7 @@ got_it: trace_f2fs_do_write_data_page(folio, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (atomic_commit) - clear_page_private_atomic(folio_page(folio, 0)); + folio_clear_f2fs_atomic(folio); out_writepage: f2fs_put_dnode(&dn); out: @@ -2771,7 +2780,6 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, bool allow_balance) { struct inode *inode = folio->mapping->host; - struct page *page = folio_page(folio, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long)i_size) @@ -2788,7 +2796,7 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NULL_ADDR, - .page = page, + .folio = folio, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, @@ -2890,7 +2898,7 @@ out: inode_dec_dirty_pages(inode); if (err) { folio_clear_uptodate(folio); - clear_page_private_gcing(page); + folio_clear_f2fs_gcing(folio); } folio_unlock(folio); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && @@ -3376,7 +3384,7 @@ restart: f2fs_do_read_inline_data(folio, ifolio); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_page_private_inline(&ifolio->page); + folio_set_f2fs_inline(ifolio); goto out; } err = f2fs_convert_inline_folio(&dn, folio); @@ -3698,7 +3706,7 @@ static int f2fs_write_end(const struct kiocb *iocb, folio_mark_dirty(folio); if (f2fs_is_atomic_file(inode)) - set_page_private_atomic(folio_page(folio, 0)); + folio_set_f2fs_atomic(folio); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { @@ -3733,7 +3741,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) f2fs_remove_dirty_inode(inode); } } - clear_page_private_all(&folio->page); + folio_detach_private(folio); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) @@ -3742,7 +3750,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - clear_page_private_all(&folio->page); + folio_detach_private(folio); return true; } @@ -4160,7 +4168,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { - struct f2fs_map_blocks map = {}; + struct f2fs_map_blocks map = { NULL, }; pgoff_t next_pgofs = 0; 
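Worked example for the compressed-readahead clamp above: with cluster_size = 4, a 3-page read starting at index 6 touches clusters [4..7] and [8..11], and max_nr_pages becomes round_up(6 + 3, 4) - round_down(6, 4) = 12 - 4 = 8 pages. The readahead window is thus always widened to whole clusters, so decompression never sees a partial one.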
int err; @@ -4169,6 +4177,10 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); + if (flags & IOMAP_WRITE && iomap->private) { + map.m_last_pblk = (unsigned long)iomap->private; + iomap->private = NULL; + } /* * If the blocks being overwritten are already allocated, @@ -4207,6 +4219,9 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); + + if (flags & IOMAP_WRITE && map.m_last_pblk) + iomap->private = (void *)map.m_last_pblk; } else { if (flags & IOMAP_WRITE) return -ENOTBLK; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 16c2dfb4f595..43a83bbd3bc5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); +static DEFINE_SPINLOCK(f2fs_stat_lock); #ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -91,7 +91,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi) seg_blks = get_seg_entry(sbi, j)->valid_blocks; /* update segment stats */ - if (IS_CURSEG(sbi, j)) + if (is_curseg(sbi, j)) dev_stats[i].devstats[0][DEVSTAT_INUSE]++; else if (seg_blks == BLKS_PER_SEG(sbi)) dev_stats[i].devstats[0][DEVSTAT_FULL]++; @@ -109,7 +109,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi) sec_blks = get_sec_entry(sbi, j)->valid_blocks; /* update section stats */ - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j))) + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, j))) dev_stats[i].devstats[1][DEVSTAT_INUSE]++; else if (sec_blks == BLKS_PER_SEC(sbi)) dev_stats[i].devstats[1][DEVSTAT_FULL]++; @@ -439,9 +439,8 @@ static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; int i = 0, j = 0; - unsigned long flags; - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_for_each_entry(si, &f2fs_stat_list, stat_list) { struct f2fs_sb_info *sbi = si->sbi; @@ -753,7 +752,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); return 0; } @@ -765,7 +764,6 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; struct f2fs_dev_stats *dev_stats; - unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -817,9 +815,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->max_aw_cnt, 0); - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_add_tail(&si->stat_list, &f2fs_stat_list); - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); return 0; } @@ -827,11 +825,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned long flags; - raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + spin_lock(&f2fs_stat_lock); list_del(&si->stat_list); - raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + spin_unlock(&f2fs_stat_lock); kfree(si->dev_stats); kfree(si); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c36b3b22bfff..fffd7749d6d1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -454,7 +454,7 @@ static void 
init_dent_inode(struct inode *dir, struct inode *inode, f2fs_folio_wait_writeback(ifolio, NODE, true, true); /* copy name info. to this inode folio */ - ri = F2FS_INODE(&ifolio->page); + ri = F2FS_INODE(ifolio); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); if (IS_ENCRYPTED(dir)) { @@ -897,7 +897,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, f2fs_clear_page_cache_dirty_tag(folio); folio_clear_dirty_for_io(folio); folio_clear_uptodate(folio); - clear_page_private_all(&folio->page); + folio_detach_private(folio); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index cfe925a3d555..199c1e7a83ef 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -19,10 +19,10 @@ #include "node.h" #include <trace/events/f2fs.h> -bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; struct extent_info ei; int devi; @@ -411,10 +411,10 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; - struct f2fs_extent *i_ext = &F2FS_INODE(&ifolio->page)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; struct extent_tree *et; struct extent_node *en; - struct extent_info ei; + struct extent_info ei = {0}; if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ @@ -934,7 +934,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ if (!__may_extent_tree(dn->inode, type)) return; - ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), dn->inode) + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + dn->ofs_in_node; ei.len = 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c78464792ceb..46be7560548c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -386,7 +386,7 @@ struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ struct discard_info di; /* discard info */ struct list_head list; /* command list */ - struct completion wait; /* compleation */ + struct completion wait; /* completion */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ @@ -732,6 +732,7 @@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; @@ -875,6 +876,7 @@ struct f2fs_inode_info { /* linked in global inode list for cache donation */ struct list_head gdonate_list; pgoff_t donate_start, donate_end; /* inclusive */ + atomic_t open_count; /* # of open files */ struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; @@ -1123,8 +1125,8 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ -#define WB_DATA_TYPE(p, f) \ - (f || f2fs_is_cp_guaranteed(p) ? 
F2FS_WB_CP_DATA : F2FS_WB_DATA) +#define WB_DATA_TYPE(folio, f) \ + (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -1240,7 +1242,10 @@ struct f2fs_io_info { blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ - struct page *page; /* page to be written */ + union { + struct page *page; /* page to be written */ + struct folio *folio; + }; struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ @@ -1286,7 +1291,7 @@ struct f2fs_bio_info { struct f2fs_dev_info { struct file *bdev_file; struct block_device *bdev; - char path[MAX_PATH_LEN]; + char path[MAX_PATH_LEN + 1]; unsigned int total_segments; block_t start_blk; block_t end_blk; @@ -1427,7 +1432,7 @@ enum { enum { MEMORY_MODE_NORMAL, /* memory mode for normal devices */ - MEMORY_MODE_LOW, /* memory mode for low memry devices */ + MEMORY_MODE_LOW, /* memory mode for low memory devices */ }; enum errors_option { @@ -1491,7 +1496,7 @@ enum compress_flag { #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ - __le32 chksum; /* compressed data chksum */ + __le32 chksum; /* compressed data checksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1536,6 +1541,7 @@ struct compress_io_ctx { struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ @@ -1576,6 +1582,7 @@ struct decompress_io_ctx { bool failed; /* IO error occurred before decompression? */ bool need_verity; /* need fs-verity verification after decompression? 
*/ + unsigned char compress_algorithm; /* backup algorithm type */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ @@ -1724,6 +1731,9 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ + /* free sections reserved for pinned file */ + unsigned int reserved_pin_section; + /* threshold for gc trials on pinned files */ unsigned short gc_pin_file_threshold; struct f2fs_rwsem pin_sem; @@ -2013,16 +2023,11 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) return F2FS_I_SB(mapping->host); } -static inline struct f2fs_sb_info *F2FS_F_SB(struct folio *folio) +static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio) { return F2FS_M_SB(folio->mapping); } -static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) -{ - return F2FS_F_SB(page_folio(page)); -} - static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); @@ -2043,14 +2048,14 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(const struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct folio *folio) { - return (struct f2fs_node *)page_address(page); + return (struct f2fs_node *)folio_address(folio); } -static inline struct f2fs_inode *F2FS_INODE(struct page *page) +static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio) { - return &((struct f2fs_node *)page_address(page))->i; + return &((struct f2fs_node *)folio_address(folio))->i; } static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) @@ -2453,6 +2458,13 @@ release_quota: } #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool folio_test_f2fs_##name(const struct folio *folio) \ +{ \ + unsigned long priv = (unsigned long)folio->private; \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + return (priv & v) == v; \ +} \ static inline bool page_private_##name(struct page *page) \ { \ return PagePrivate(page) && \ @@ -2461,6 +2473,17 @@ static inline bool page_private_##name(struct page *page) \ } #define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void folio_set_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + if (!folio->private) \ + folio_attach_private(folio, (void *)v); \ + else { \ + v |= (unsigned long)folio->private; \ + folio->private = (void *)v; \ + } \ +} \ static inline void set_page_private_##name(struct page *page) \ { \ if (!PagePrivate(page)) \ @@ -2470,6 +2493,16 @@ static inline void set_page_private_##name(struct page *page) \ } #define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void folio_clear_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (unsigned long)folio->private; \ + \ + v &= ~(1UL << PAGE_PRIVATE_##flagname); \ + if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \ + folio_detach_private(folio); \ + else \ + folio->private = (void *)v; \ +} \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ @@ -2492,39 +2525,23 @@ PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); 
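As an aside on the folio_{test,set,clear}_f2fs_* generators above: everything lives in the single pointer-sized folio->private word. Bit PAGE_PRIVATE_NOT_POINTER is a sentinel saying the word holds flags rather than a pointer, the per-folio flag bits follow, and the data helpers shift a payload above PAGE_PRIVATE_MAX; clearing the last flag detaches the word when only the sentinel remains. A compilable userspace model of the same encoding, where struct folio is a one-field stand-in and the enum names are invented::

  #include <assert.h>
  #include <stdbool.h>
  #include <stdio.h>

  enum {
          PRIV_NOT_POINTER,  /* sentinel: word is flags, not a pointer */
          PRIV_INLINE,       /* sample flag */
          PRIV_GCING,        /* sample flag */
          PRIV_ATOMIC,       /* sample flag */
          PRIV_MAX,          /* payload starts at this bit */
  };

  struct folio { void *private; };   /* stand-in for struct folio */

  static bool test_flag(const struct folio *f, int flag)
  {
          unsigned long v = (1UL << PRIV_NOT_POINTER) | (1UL << flag);

          return ((unsigned long)f->private & v) == v;
  }

  static void set_flag(struct folio *f, int flag)
  {
          unsigned long v = (1UL << PRIV_NOT_POINTER) | (1UL << flag);

          f->private = (void *)((unsigned long)f->private | v);
  }

  static void clear_flag(struct folio *f, int flag)
  {
          unsigned long v = (unsigned long)f->private;

          v &= ~(1UL << flag);
          /* drop the word entirely once only the sentinel remains */
          f->private = (v == (1UL << PRIV_NOT_POINTER)) ? NULL : (void *)v;
  }

  static void set_data(struct folio *f, unsigned long data)
  {
          f->private = (void *)((unsigned long)f->private |
                          (1UL << PRIV_NOT_POINTER) | (data << PRIV_MAX));
  }

  static unsigned long get_data(const struct folio *f)
  {
          unsigned long v = (unsigned long)f->private;

          return (v & (1UL << PRIV_NOT_POINTER)) ? v >> PRIV_MAX : 0;
  }

  int main(void)
  {
          struct folio f = { NULL };

          set_flag(&f, PRIV_GCING);
          set_data(&f, 42);
          assert(test_flag(&f, PRIV_GCING) && !test_flag(&f, PRIV_INLINE));
          assert(get_data(&f) == 42);
          clear_flag(&f, PRIV_GCING);
          printf("data survives flag clear: %lu\n", get_data(&f));
          return 0;
  }

The sentinel bit is what lets readers distinguish this packed word from a genuine private pointer attached by other paths.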
PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); -static inline unsigned long get_page_private_data(struct page *page) +static inline unsigned long folio_get_f2fs_data(struct folio *folio) { - unsigned long data = page_private(page); + unsigned long data = (unsigned long)folio->private; if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) return 0; return data >> PAGE_PRIVATE_MAX; } -static inline void set_page_private_data(struct page *page, unsigned long data) +static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data) { - if (!PagePrivate(page)) - attach_page_private(page, (void *)0); - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); - page_private(page) |= data << PAGE_PRIVATE_MAX; -} - -static inline void clear_page_private_data(struct page *page) -{ - page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); - if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) - detach_page_private(page); -} + data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX); -static inline void clear_page_private_all(struct page *page) -{ - clear_page_private_data(page); - clear_page_private_reference(page); - clear_page_private_gcing(page); - clear_page_private_inline(page); - clear_page_private_atomic(page); - - f2fs_bug_on(F2FS_P_SB(page), page_private(page)); + if (!folio_test_private(folio)) + folio_attach_private(folio, (void *)data); + else + folio->private = (void *)((unsigned long)folio->private | data); } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, @@ -3011,9 +3028,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) -static inline bool IS_INODE(struct page *page) +static inline bool IS_INODE(const struct folio *folio) { - struct f2fs_node *p = F2FS_NODE(page); + struct f2fs_node *p = F2FS_NODE(folio); return RAW_IS_INODE(p); } @@ -3031,20 +3048,20 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node) static inline int f2fs_has_extra_attr(struct inode *inode); static inline unsigned int get_dnode_base(struct inode *inode, - struct page *node_page) + struct folio *node_folio) { - if (!IS_INODE(node_page)) + if (!IS_INODE(node_folio)) return 0; return inode ? 
get_extra_isize(inode) : - offset_in_addr(&F2FS_NODE(node_page)->i); + offset_in_addr(&F2FS_NODE(node_folio)->i); } static inline __le32 *get_dnode_addr(struct inode *inode, struct folio *node_folio) { - return blkaddr_in_node(F2FS_NODE(&node_folio->page)) + - get_dnode_base(inode, &node_folio->page); + return blkaddr_in_node(F2FS_NODE(node_folio)) + + get_dnode_base(inode, node_folio); } static inline block_t data_blkaddr(struct inode *inode, @@ -3366,9 +3383,10 @@ static inline unsigned int addrs_per_page(struct inode *inode, return addrs; } -static inline void *inline_xattr_addr(struct inode *inode, struct folio *folio) +static inline +void *inline_xattr_addr(struct inode *inode, const struct folio *folio) { - struct f2fs_inode *ri = F2FS_INODE(&folio->page); + struct f2fs_inode *ri = F2FS_INODE(folio); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)]); @@ -3628,13 +3646,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc); */ void f2fs_set_inode_flags(struct inode *inode); bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); -void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_update_inode(struct inode *inode, struct folio *node_folio); void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); void f2fs_handle_failed_inode(struct inode *inode); @@ -3784,8 +3803,8 @@ void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); -int f2fs_recover_xattr_data(struct inode *inode, struct page *page); -int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3852,7 +3871,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, bool recover_newaddr); enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type seg_type); -int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); @@ -3886,7 +3905,7 @@ unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, static inline struct inode *fio_inode(struct f2fs_io_info *fio) { - return page_folio(fio->page)->mapping->host; + return fio->folio->mapping->host; } #define DEF_FRAGMENT_SIZE 4 @@ -3953,7 +3972,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -bool f2fs_is_cp_guaranteed(struct page *page); +bool 
f2fs_is_cp_guaranteed(const struct folio *folio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -3961,7 +3980,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, + struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); @@ -4303,7 +4322,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); -bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage); +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, @@ -4345,7 +4364,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -bool sanity_check_extent_cache(struct inode *inode, struct page *ipage); +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); @@ -4435,20 +4454,20 @@ enum cluster_check_type { CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */ CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ }; -bool f2fs_is_compressed_page(struct page *page); +bool f2fs_is_compressed_page(struct folio *folio); struct folio *f2fs_compress_control_folio(struct folio *folio); int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); -void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio); bool f2fs_is_compress_backend_ready(struct inode *inode); bool f2fs_is_compress_level_valid(int alg, int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); -void f2fs_end_read_compressed_page(struct page *page, bool failed, +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); @@ -4486,8 +4505,6 @@ void f2fs_destroy_compress_cache(void); struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len); -void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, - nid_t ino, block_t blkaddr); bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr); void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); @@ -4504,7 +4521,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); 
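As an aside on the compression API around this point: the hunk that follows keeps the usual kernel pattern for optional features, supplying static inline no-op stubs with identical signatures when CONFIG_F2FS_FS_COMPRESSION is off, so call sites never sprout #ifdefs. A tiny standalone illustration; CONFIG_DEMO_COMPRESSION and demo_is_compressed() are invented for the example::

  #include <stdbool.h>
  #include <stdio.h>

  struct folio_stub { int id; };

  #ifdef CONFIG_DEMO_COMPRESSION
  bool demo_is_compressed(struct folio_stub *folio);  /* real impl elsewhere */
  #else
  static inline bool demo_is_compressed(struct folio_stub *folio)
  {
          (void)folio;
          return false;   /* feature absent: nothing is ever compressed */
  }
  #endif

  int main(void)
  {
          struct folio_stub f = { 1 };

          /* same call either way; the compiler folds the stub away */
          printf("compressed: %d\n", demo_is_compressed(&f));
          return 0;
  }

With the feature compiled out, the stub is a compile-time constant and any branch at the call site folds away.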
sbi->compr_saved_block += diff; \ } while (0) #else -static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) @@ -4522,7 +4539,7 @@ static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } -static inline void f2fs_end_read_compressed_page(struct page *page, +static inline void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); @@ -4542,8 +4559,6 @@ static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { } -static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, - struct page *page, nid_t ino, block_t blkaddr) { } static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr) { return false; } static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c677230699fd..42faaed6a02d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -489,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -629,7 +629,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; - return finish_preallocate_blocks(inode); + err = finish_preallocate_blocks(inode); + if (!err) + atomic_inc(&F2FS_I(inode)->open_count); + return err; } void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -708,7 +711,7 @@ next: * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. 
*/ - fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); f2fs_update_age_extent_cache_range(dn, fofs, len); @@ -815,12 +818,12 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } - count = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + count = ADDRS_PER_PAGE(dn.node_folio, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); - if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) { + if (dn.ofs_in_node || IS_INODE(dn.node_folio)) { f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } @@ -1043,11 +1046,24 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = setattr_prepare(idmap, dentry, attr); + if (err) + return err; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = fsverity_prepare_setattr(dentry, attr); + if (err) + return err; + if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -1064,20 +1080,19 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, !IS_ALIGNED(attr->ia_size, F2FS_BLK_TO_BYTES(fi->i_cluster_size))) return -EINVAL; + /* + * To prevent scattered pin block generation, we don't allow + * smaller/equal size unaligned truncation for pinned file. + * We only support overwrite IO to pinned file, so don't + * care about larger size truncation. + */ + if (f2fs_is_pinned_file(inode) && + attr->ia_size <= i_size_read(inode) && + !IS_ALIGNED(attr->ia_size, + F2FS_BLK_TO_BYTES(CAP_BLKS_PER_SEC(sbi)))) + return -EINVAL; } - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; - - err = fscrypt_prepare_setattr(dentry, attr); - if (err) - return err; - - err = fsverity_prepare_setattr(dentry, attr); - if (err) - return err; - if (is_quota_modification(idmap, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) @@ -1085,12 +1100,11 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); err = dquot_transfer(idmap, inode, attr); if (err) { - set_sbi_flag(F2FS_I_SB(inode), - SBI_QUOTA_NEED_REPAIR); - f2fs_unlock_op(F2FS_I_SB(inode)); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(sbi); return err; } /* @@ -1100,7 +1114,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); - f2fs_unlock_op(F2FS_I_SB(inode)); + f2fs_unlock_op(sbi); } if (attr->ia_valid & ATTR_SIZE) { @@ -1163,7 +1177,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); return err; } @@ -1223,7 +1237,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 
0 || count > end_offset); @@ -1322,7 +1336,7 @@ next_dnode: goto next; } - done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, inode) - + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = f2fs_data_blkaddr(&dn); @@ -1411,7 +1425,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } ilen = min((pgoff_t) - ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) - + ADDRS_PER_PAGE(dn.node_folio, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = f2fs_data_blkaddr(&dn); @@ -1453,7 +1467,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE); folio_mark_dirty(fdst); - set_page_private_gcing(&fdst->page); + folio_set_f2fs_gcing(fdst); f2fs_folio_put(fdst, true); f2fs_folio_put(fsrc, true); @@ -1707,7 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); end = min(pg_end, end_offset - dn.ofs_in_node + index); ret = f2fs_do_zero_range(&dn, index, end); @@ -1888,9 +1902,8 @@ next_alloc: } } - if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? - ZONED_PIN_SEC_REQUIRED_COUNT : - GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { + if (has_not_enough_free_secs(sbi, 0, + sbi->reserved_pin_section)) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); @@ -2028,6 +2041,9 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + if (atomic_dec_and_test(&F2FS_I(inode)->open_count)) + f2fs_remove_donate_inode(inode); + /* * f2fs_release_file is called at every close calls. So we should * not drop any inmemory pages by close called by other process. 
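As an aside on the open_count pairing visible in these file.c hunks (atomic_inc() on a successful f2fs_file_open(), atomic_dec_and_test() here in f2fs_release_file()): because ->release fires for every close, the counter ensures f2fs_remove_donate_inode() runs exactly once, at the last close. A userspace model using C11 atomics; the demo_* names are invented, and atomic_fetch_sub() returning 1 stands in for atomic_dec_and_test()::

  #include <stdatomic.h>
  #include <stdio.h>

  struct demo_inode {
          atomic_int open_count;
  };

  static void demo_open(struct demo_inode *inode)
  {
          atomic_fetch_add(&inode->open_count, 1);
  }

  static void demo_release(struct demo_inode *inode)
  {
          /* mirrors atomic_dec_and_test(): true only for the final closer */
          if (atomic_fetch_sub(&inode->open_count, 1) == 1)
                  printf("last close: run cleanup once\n");
          else
                  printf("still open elsewhere: skip cleanup\n");
  }

  int main(void)
  {
          struct demo_inode inode = { 0 };

          demo_open(&inode);
          demo_open(&inode);
          demo_release(&inode);   /* skipped */
          demo_release(&inode);   /* cleanup runs here */
          return 0;
  }

atomic_fetch_sub() returns the value before the decrement, so observing 1 means this caller took the count to zero, which matches the kernel's atomic_dec_and_test() semantics.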
@@ -2978,7 +2994,7 @@ do_map: f2fs_folio_wait_writeback(folio, DATA, true, true); folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); f2fs_folio_put(folio, true); idx++; @@ -3876,7 +3892,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4054,7 +4070,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4218,7 +4234,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto out; } - end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); count = min(end_offset - dn.ofs_in_node, pg_end - index); for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { struct block_device *cur_bdev; @@ -4415,7 +4431,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) f2fs_folio_wait_writeback(folio, DATA, true, true); folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); redirty_idx = folio_next_index(folio); folio_unlock(folio); folio_put_refs(folio, 2); @@ -4825,6 +4841,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); const loff_t pos = iocb->ki_pos; ssize_t ret; + bool dio; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -4833,12 +4850,15 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, iov_iter_count(to), READ); + dio = f2fs_should_use_dio(inode, iocb, to); + /* In LFS mode, if there is inflight dio, wait for its completion */ if (f2fs_lfs_mode(F2FS_I_SB(inode)) && - get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE) && + (!f2fs_is_pinned_file(inode) || !dio)) inode_dio_wait(inode); - if (f2fs_should_use_dio(inode, iocb, to)) { + if (dio) { ret = f2fs_dio_read_iter(iocb, to); } else { ret = filemap_read(iocb, to, 0); @@ -4846,8 +4866,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); } - if (trace_f2fs_dataread_end_enabled()) - trace_f2fs_dataread_end(inode, pos, ret); + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4870,8 +4889,7 @@ static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos, f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); - if (trace_f2fs_dataread_end_enabled()) - trace_f2fs_dataread_end(inode, pos, ret); + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -5216,8 +5234,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); - if (trace_f2fs_datawrite_end_enabled()) - trace_f2fs_datawrite_end(inode, orig_pos, ret); + trace_f2fs_datawrite_end(inode, orig_pos, ret); } /* Don't leave any preallocated blocks around past i_size. 
*/ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3cb5242f4ddf..098e9f71421e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,10 +141,10 @@ do_gc: FOREGROUND : BACKGROUND); sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) || - gc_control.one_time; + (gc_control.one_time && gc_th->boost_gc_greedy); /* foreground GC was been triggered via f2fs_balance_fs() */ - if (foreground) + if (foreground && !f2fs_sb_has_blkzoned(sbi)) sync_mode = false; gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; @@ -197,6 +197,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO; + gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE; + gc_th->boost_gc_greedy = GC_GREEDY; if (f2fs_sb_has_blkzoned(sbi)) { gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED; @@ -278,12 +280,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode == SSR) { - p->gc_mode = GC_GREEDY; - p->dirty_bitmap = dirty_i->dirty_segmap[type]; - p->max_search = dirty_i->nr_dirty[type]; - p->ofs_unit = 1; - } else if (p->alloc_mode == AT_SSR) { + if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) { p->gc_mode = GC_GREEDY; p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; @@ -389,14 +386,15 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) } static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, - unsigned int segno, struct victim_sel_policy *p) + unsigned int segno, struct victim_sel_policy *p, + unsigned int valid_thresh_ratio) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; - if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >= - CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio / - 100)) + if (p->one_time_gc && (valid_thresh_ratio < 100) && + (get_valid_blocks(sbi, segno, true) >= + CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100)) return UINT_MAX; /* alloc_mode == LFS */ @@ -777,6 +775,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, unsigned int secno, last_victim; unsigned int last_segment; unsigned int nsearched; + unsigned int valid_thresh_ratio = 100; bool is_atgc; int ret = 0; @@ -786,7 +785,11 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, p.alloc_mode = alloc_mode; p.age = age; p.age_threshold = sbi->am.age_threshold; - p.one_time_gc = one_time; + if (one_time) { + p.one_time_gc = one_time; + if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG)) + valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio; + } retry: select_policy(sbi, gc_type, type, &p); @@ -912,7 +915,7 @@ retry: goto next; } - cost = get_gc_cost(sbi, segno, &p); + cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio); if (p.min_cost > cost) { p.min_segno = segno; @@ -1162,8 +1165,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - if (IS_INODE(&node_folio->page)) { - base = offset_in_addr(F2FS_INODE(&node_folio->page)); + if (IS_INODE(node_folio)) { + base = offset_in_addr(F2FS_INODE(node_folio)); max_addrs = DEF_ADDRS_PER_INODE; } else { base = 0; @@ -1177,7 +1180,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - *nofs = ofs_of_node(&node_folio->page); + *nofs = ofs_of_node(node_folio); source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node); f2fs_folio_put(node_folio, 
true); @@ -1249,7 +1252,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) } got_it: /* read folio */ - fio.page = &folio->page; + fio.folio = folio; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; /* @@ -1353,7 +1356,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_out; /* read page */ - fio.page = &folio->page; + fio.folio = folio; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) @@ -1473,7 +1476,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } folio_mark_dirty(folio); - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,7 +1486,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, @@ -1499,11 +1502,11 @@ retry: f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(&folio->page); + folio_set_f2fs_gcing(folio); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(&folio->page); + folio_clear_f2fs_gcing(folio); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; @@ -1749,7 +1752,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, !has_enough_free_blocks(sbi, sbi->gc_thread->boost_zoned_gc_percent)) window_granularity *= - BOOST_GC_MULTIPLE; + sbi->gc_thread->boost_gc_multiple; end_segno = start_segno + window_granularity; } @@ -1891,6 +1894,7 @@ gc_more: /* Let's run FG_GC, if we don't have enough space. */ if (has_not_enough_free_secs(sbi, 0, 0)) { gc_type = FG_GC; + gc_control->one_time = false; /* * For example, if there are many prefree_segments below given @@ -2064,7 +2068,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, segno))) + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5c1eaf55e127..24e8b1c27acc 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -68,6 +68,8 @@ struct f2fs_gc_kthread { unsigned int no_zoned_gc_percent; unsigned int boost_zoned_gc_percent; unsigned int valid_thresh_ratio; + unsigned int boost_gc_multiple; + unsigned int boost_gc_greedy; }; struct gc_inode_list { @@ -194,6 +196,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi) { if (f2fs_sb_has_blkzoned(sbi)) - return !has_enough_free_blocks(sbi, LIMIT_BOOST_ZONED_GC); + return !has_enough_free_blocks(sbi, + sbi->gc_thread->boost_zoned_gc_percent); return has_enough_invalid_blocks(sbi); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 901c630685ce..58ac831ef704 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -33,9 +33,9 @@ bool f2fs_may_inline_data(struct inode *inode) return !f2fs_post_read_required(inode); } -static bool inode_has_blocks(struct inode *inode, struct page *ipage) +static bool inode_has_blocks(struct inode *inode, struct folio *ifolio) { - struct f2fs_inode *ri = F2FS_INODE(ipage); + struct f2fs_inode *ri = F2FS_INODE(ifolio); int i; if (F2FS_HAS_BLOCKS(inode)) @@ -48,12 +48,12 @@ static bool inode_has_blocks(struct inode *inode, struct page *ipage) return false; } -bool f2fs_sanity_check_inline_data(struct inode *inode, struct page 
*ipage) +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio) { if (!f2fs_has_inline_data(inode)) return false; - if (inode_has_blocks(inode, ipage)) + if (inode_has_blocks(inode, ifolio)) return false; if (!support_inline_data(inode)) @@ -150,7 +150,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .io_type = FS_DATA_IO, }; @@ -206,7 +206,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0); - clear_page_private_inline(&dn->inode_folio->page); + folio_clear_f2fs_inline(dn->inode_folio); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -286,7 +286,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_page_private_inline(&ifolio->page); + folio_clear_f2fs_inline(ifolio); f2fs_folio_put(ifolio, 1); return 0; } @@ -305,8 +305,8 @@ int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio) * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ - if (IS_INODE(&nfolio->page)) - ri = F2FS_INODE(&nfolio->page); + if (IS_INODE(nfolio)) + ri = F2FS_INODE(nfolio); if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { @@ -825,7 +825,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ifolio) - - (char *)F2FS_INODE(&ifolio->page); + (char *)F2FS_INODE(ifolio); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); out: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 083d52a42bfb..8c4eafe9ffac 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -108,7 +108,7 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio) f2fs_folio_wait_writeback(ifolio, NODE, true, true); set_inode_flag(inode, FI_DATA_EXIST); - set_raw_inline(inode, F2FS_INODE(&ifolio->page)); + set_raw_inline(inode, F2FS_INODE(ifolio)); folio_mark_dirty(ifolio); return; } @@ -116,14 +116,15 @@ static void __recover_inline_status(struct inode *inode, struct folio *ifolio) return; } -static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +static +bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_inode *ri = &F2FS_NODE(page)->i; + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; if (!f2fs_sb_has_inode_chksum(sbi)) return false; - if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), @@ -133,9 +134,9 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page return true; } -static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_node *node = F2FS_NODE(folio); struct f2fs_inode *ri = &node->i; __le32 ino = node->footer.ino; __le32 gen = ri->i_generation; @@ -164,34 +165,34 @@ bool 
f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio) return true; #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_enable_inode_chksum(sbi, &folio->page)) + if (!f2fs_enable_inode_chksum(sbi, folio)) #else - if (!f2fs_enable_inode_chksum(sbi, &folio->page) || + if (!f2fs_enable_inode_chksum(sbi, folio) || folio_test_dirty(folio) || folio_test_writeback(folio)) #endif return true; - ri = &F2FS_NODE(&folio->page)->i; + ri = &F2FS_NODE(folio)->i; provided = le32_to_cpu(ri->i_inode_checksum); - calculated = f2fs_inode_chksum(sbi, &folio->page); + calculated = f2fs_inode_chksum(sbi, folio); if (provided != calculated) f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", - folio->index, ino_of_node(&folio->page), + folio->index, ino_of_node(folio), provided, calculated); return provided == calculated; } -void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio) { - struct f2fs_inode *ri = &F2FS_NODE(page)->i; + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; - if (!f2fs_enable_inode_chksum(sbi, page)) + if (!f2fs_enable_inode_chksum(sbi, folio)) return; - ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, folio)); } static bool sanity_check_compress_inode(struct inode *inode, @@ -266,28 +267,28 @@ err_level: return false; } -static bool sanity_check_inode(struct inode *inode, struct page *node_page) +static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri = F2FS_INODE(node_page); + struct f2fs_inode *ri = F2FS_INODE(node_folio); unsigned long long iblocks; - iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks); if (!iblocks) { f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } - if (ino_of_node(node_page) != nid_of_node(node_page)) { + if (ino_of_node(node_folio) != nid_of_node(node_folio)) { f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, - ino_of_node(node_page), nid_of_node(node_page)); + ino_of_node(node_folio), nid_of_node(node_folio)); return false; } - if (ino_of_node(node_page) == fi->i_xattr_nid) { + if (ino_of_node(node_folio) == fi->i_xattr_nid) { f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", __func__, inode->i_ino, fi->i_xattr_nid); return false; @@ -354,7 +355,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_sanity_check_inline_data(inode, node_page)) { + if (f2fs_sanity_check_inline_data(inode, node_folio)) { f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; @@ -419,7 +420,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_folio)) return PTR_ERR(node_folio); - ri = F2FS_INODE(&node_folio->page); + ri = F2FS_INODE(node_folio); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -469,7 +470,7 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, &node_folio->page)) { + if (!sanity_check_inode(inode, node_folio)) { f2fs_folio_put(node_folio, true); set_sbi_flag(sbi, 
SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); @@ -481,9 +482,9 @@ static int do_read_inode(struct inode *inode) __recover_inline_status(inode, node_folio); /* try to recover cold bit for non-dir inode */ - if (!S_ISDIR(inode->i_mode) && !is_cold_node(&node_folio->page)) { + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_folio)) { f2fs_folio_wait_writeback(node_folio, NODE, true, true); - set_cold_node(&node_folio->page, false); + set_cold_node(node_folio, false); folio_mark_dirty(node_folio); } @@ -531,7 +532,7 @@ static int do_read_inode(struct inode *inode) init_idisk_time(inode); - if (!sanity_check_extent_cache(inode, &node_folio->page)) { + if (!sanity_check_extent_cache(inode, node_folio)) { f2fs_folio_put(node_folio, true); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; @@ -669,7 +670,7 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio) f2fs_inode_synced(inode); - ri = F2FS_INODE(&node_folio->page); + ri = F2FS_INODE(node_folio); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = fi->i_advise; @@ -748,11 +749,11 @@ void f2fs_update_inode(struct inode *inode, struct folio *node_folio) /* deleted inode */ if (inode->i_nlink == 0) - clear_page_private_inline(&node_folio->page); + folio_clear_f2fs_inline(node_folio); init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_I_SB(inode), &node_folio->page); + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_folio); #endif } @@ -820,7 +821,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } -static void f2fs_remove_donate_inode(struct inode *inode) +void f2fs_remove_donate_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -933,6 +934,19 @@ retry: f2fs_update_inode_page(inode); if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + /* + * If both f2fs_truncate() and f2fs_update_inode_page() failed + * due to fuzzed corrupted inode, call f2fs_inode_synced() to + * avoid triggering later f2fs_bug_on(). + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(sbi, + "f2fs_evict_inode: inode is dirty, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } } if (freeze_protected) sb_end_intwrite(inode->i_sb); @@ -949,8 +963,12 @@ no_delete: if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); - else - f2fs_inode_synced(inode); + + /* + * anyway, it needs to remove the inode from sbi->inode_list[DIRTY_META] + * list to avoid UAF in f2fs_sync_inode_meta() during checkpoint. 
+ */ + f2fs_inode_synced(inode); /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ if (inode->i_ino) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 07e333ee21b7..b882771e4699 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1298,19 +1298,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; const char *target; if (!dentry) return ERR_PTR(-ECHILD); - page = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); + folio = read_mapping_folio(inode->i_mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); - target = fscrypt_get_symlink(inode, page_address(page), + target = fscrypt_get_symlink(inode, folio_address(folio), inode->i_sb->s_blocksize, done); - put_page(page); + folio_put(folio); return target; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bfe104db284e..27743b93e186 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -135,7 +135,7 @@ static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid)); } -static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +static struct folio *get_next_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) { struct folio *src_folio; struct folio *dst_folio; @@ -149,7 +149,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_folio = get_current_nat_folio(sbi, nid); if (IS_ERR(src_folio)) - return &src_folio->page; + return src_folio; dst_folio = f2fs_grab_meta_folio(sbi, dst_off); f2fs_bug_on(sbi, folio_test_dirty(src_folio)); @@ -161,7 +161,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) set_to_next_nat(nm_i, nid); - return &dst_folio->page; + return dst_folio; } static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, @@ -185,7 +185,7 @@ static void __free_nat_entry(struct nat_entry *e) /* must be locked by nat_tree_lock */ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, - struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail, bool init_dirty) { if (no_fail) f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); @@ -195,6 +195,12 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, if (raw_ne) node_info_from_raw_nat(&ne->ni, raw_ne); + if (init_dirty) { + INIT_LIST_HEAD(&ne->list); + nm_i->nat_cnt[TOTAL_NAT]++; + return ne; + } + spin_lock(&nm_i->nat_list_lock); list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); @@ -204,14 +210,17 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, return ne; } -static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n, bool for_dirty) { struct nat_entry *ne; ne = radix_tree_lookup(&nm_i->nat_root, n); - /* for recent accessed nat entry, move it to tail of lru list */ - if (ne && !get_nat_flag(ne, IS_DIRTY)) { + /* + * for recent accessed nat entry which will not be dirtied soon + * later, move it to tail of lru list. 
+ */ + if (ne && !get_nat_flag(ne, IS_DIRTY) && !for_dirty) { spin_lock(&nm_i->nat_list_lock); if (!list_empty(&ne->list)) list_move_tail(&ne->list, &nm_i->nat_entries); @@ -256,7 +265,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry *ne, bool init_dirty) { struct nat_entry_set *head; bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; @@ -279,7 +288,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, goto refresh_list; nm_i->nat_cnt[DIRTY_NAT]++; - nm_i->nat_cnt[RECLAIMABLE_NAT]--; + if (!init_dirty) + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -312,8 +322,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio) { - return is_node_folio(folio) && IS_DNODE(&folio->page) && - is_cold_node(&folio->page); + return is_node_folio(folio) && IS_DNODE(folio) && is_cold_node(folio); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -384,7 +393,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) bool need = false; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) @@ -401,7 +410,7 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) bool is_cp = true; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; f2fs_up_read(&nm_i->nat_tree_lock); @@ -415,7 +424,7 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) bool need_update = true; f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ino); + e = __lookup_nat_cache(nm_i, ino, false); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) @@ -440,9 +449,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, return; f2fs_down_write(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (!e) - e = __init_nat_entry(nm_i, new, ne, false); + e = __init_nat_entry(nm_i, new, ne, false, false); else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -459,11 +468,13 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); + bool init_dirty = false; f2fs_down_write(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ni->nid); + e = __lookup_nat_cache(nm_i, ni->nid, true); if (!e) { - e = __init_nat_entry(nm_i, new, NULL, true); + init_dirty = true; + e = __init_nat_entry(nm_i, new, NULL, true, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -499,11 +510,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, nat_set_blkaddr(e, new_blkaddr); if (!__is_valid_data_blkaddr(new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); - __set_nat_cache_dirty(nm_i, e); + __set_nat_cache_dirty(nm_i, e, init_dirty); /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) - e = 
__lookup_nat_cache(nm_i, ni->ino); + e = __lookup_nat_cache(nm_i, ni->ino, false); if (e) { if (fsync_done && ni->nid == ni->ino) set_nat_flag(e, HAS_FSYNCED_INODE, true); @@ -555,20 +566,24 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry ne; struct nat_entry *e; pgoff_t index; - block_t blkaddr; int i; + bool need_cache = true; ni->flag = 0; ni->nid = nid; retry: /* Check nat cache */ f2fs_down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); f2fs_up_read(&nm_i->nat_tree_lock); + if (IS_ENABLED(CONFIG_F2FS_CHECK_FS)) { + need_cache = false; + goto sanity_check; + } return 0; } @@ -594,7 +609,7 @@ retry: up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); - goto cache; + goto sanity_check; } /* Fill node_info from nat page */ @@ -609,14 +624,23 @@ retry: ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_folio_put(folio, true); -cache: - blkaddr = le32_to_cpu(ne.block_addr); - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - return -EFAULT; +sanity_check: + if (__is_valid_data_blkaddr(ni->blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni->blk_addr, + DATA_GENERIC_ENHANCE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "f2fs_get_node_info of %pS: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + __builtin_return_address(0), + ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } /* cache nat entry */ - cache_nat_entry(sbi, nid, &ne); + if (need_cache) + cache_nat_entry(sbi, nid, &ne); return 0; } @@ -636,7 +660,7 @@ static void f2fs_ra_node_pages(struct folio *parent, int start, int n) end = start + n; end = min(end, (int)NIDS_PER_BLOCK); for (i = start; i < end; i++) { - nid = get_nid(&parent->page, i, false); + nid = get_nid(parent, i, false); f2fs_ra_node_page(sbi, nid); } @@ -795,7 +819,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) parent = nfolio[0]; if (level != 0) - nids[1] = get_nid(&parent->page, offset[0], true); + nids[1] = get_nid(parent, offset[0], true); dn->inode_folio = nfolio[0]; dn->inode_folio_locked = true; @@ -803,6 +827,16 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) for (i = 1; i <= level; i++) { bool done = false; + if (nids[i] && nids[i] == dn->inode->i_ino) { + err = -EFSCORRUPTED; + f2fs_err_ratelimited(sbi, + "inode mapping table is corrupted, run fsck to fix it, " + "ino:%lu, nid:%u, level:%d, offset:%d", + dn->inode->i_ino, nids[i], level, offset[level]); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto release_pages; + } + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!f2fs_alloc_nid(sbi, &(nids[i]))) { @@ -846,7 +880,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (i < level) { parent = nfolio[i]; - nids[i + 1] = get_nid(&parent->page, offset[i], false); + nids[i + 1] = get_nid(parent, offset[i], false); } } dn->nid = nids[level]; @@ -961,9 +995,9 @@ static int truncate_dnode(struct dnode_of_data *dn) else if (IS_ERR(folio)) return PTR_ERR(folio); - if (IS_INODE(&folio->page) || ino_of_node(&folio->page) != dn->inode->i_ino) { + if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { f2fs_err(sbi, 
"incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", - dn->inode->i_ino, dn->nid, ino_of_node(&folio->page)); + dn->inode->i_ino, dn->nid, ino_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); f2fs_folio_put(folio, true); @@ -1007,7 +1041,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK); - rn = F2FS_NODE(&folio->page); + rn = F2FS_NODE(folio); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -1070,7 +1104,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, int i; int idx = depth - 2; - nid[0] = get_nid(&dn->inode_folio->page, offset[0], true); + nid[0] = get_nid(dn->inode_folio, offset[0], true); if (!nid[0]) return 0; @@ -1083,14 +1117,14 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, idx = i - 1; goto fail; } - nid[i + 1] = get_nid(&folios[i]->page, offset[i + 1], false); + nid[i + 1] = get_nid(folios[i], offset[i + 1], false); } f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { - child_nid = get_nid(&folios[idx]->page, i, false); + child_nid = get_nid(folios[idx], i, false); if (!child_nid) continue; dn->nid = child_nid; @@ -1159,7 +1193,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) set_new_dnode(&dn, inode, folio, NULL, 0); folio_unlock(folio); - ri = F2FS_INODE(&folio->page); + ri = F2FS_INODE(folio); switch (level) { case 0: case 1: @@ -1188,7 +1222,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(&folio->page, offset[0], true); + dn.nid = get_nid(folio, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1220,7 +1254,7 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + if (offset[1] == 0 && get_nid(folio, offset[0], true)) { folio_lock(folio); BUG_ON(!is_node_folio(folio)); set_nid(folio, offset[0], 0, true); @@ -1367,8 +1401,8 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs) set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_folio_wait_writeback(folio, NODE, true, true); - fill_node_footer(&folio->page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(&folio->page, S_ISDIR(dn->inode->i_mode)); + fill_node_footer(folio, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(folio, S_ISDIR(dn->inode->i_mode)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); if (folio_mark_dirty(folio)) @@ -1400,7 +1434,7 @@ static int read_node_folio(struct folio *folio, blk_opf_t op_flags) .type = NODE, .op = REQ_OP_READ, .op_flags = op_flags, - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, }; int err; @@ -1462,17 +1496,15 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - struct page *page = &folio->page; - - if (unlikely(nid != nid_of_node(page) || - (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + if (unlikely(nid != nid_of_node(folio) || + (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(page))) || + !f2fs_has_xattr_block(ofs_of_node(folio))) || time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { f2fs_warn(sbi, "inconsistent node block, node_type:%d, 
nid:%lu, " "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), next_blkaddr_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); @@ -1553,7 +1585,7 @@ struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid) static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start) { struct f2fs_sb_info *sbi = F2FS_F_SB(parent); - nid_t nid = get_nid(&parent->page, start, false); + nid_t nid = get_nid(parent, start, false); return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR); } @@ -1618,9 +1650,9 @@ static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return ERR_PTR(-EIO); } - if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) + if (!IS_DNODE(folio) || !is_cold_node(folio)) continue; - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) continue; folio_lock(folio); @@ -1630,7 +1662,7 @@ continue_unlock: folio_unlock(folio); continue; } - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) goto continue_unlock; if (!folio_test_dirty(folio)) { @@ -1660,11 +1692,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, - .ino = ino_of_node(&folio->page), + .ino = ino_of_node(folio), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), - .page = &folio->page, + .folio = folio, .encrypted_page = NULL, .submitted = 0, .io_type = io_type, @@ -1689,11 +1721,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && wbc->sync_mode == WB_SYNC_NONE && - IS_DNODE(&folio->page) && is_cold_node(&folio->page)) + IS_DNODE(folio) && is_cold_node(folio)) goto redirty_out; /* get old block addr of this node page */ - nid = nid_of_node(&folio->page); + nid = nid_of_node(folio); f2fs_bug_on(sbi, folio->index != nid); if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) @@ -1731,7 +1763,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(&folio->page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio)); dec_page_count(sbi, F2FS_DIRTY_NODES); f2fs_up_read(&sbi->node_write); @@ -1827,9 +1859,9 @@ retry: goto out; } - if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) + if (!IS_DNODE(folio) || !is_cold_node(folio)) continue; - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) continue; folio_lock(folio); @@ -1839,7 +1871,7 @@ continue_unlock: folio_unlock(folio); continue; } - if (ino_of_node(&folio->page) != ino) + if (ino_of_node(folio) != ino) goto continue_unlock; if (!folio_test_dirty(folio) && folio != last_folio) { @@ -1849,17 +1881,17 @@ continue_unlock: f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(&folio->page, 0); - set_dentry_mark(&folio->page, 0); + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); if (!atomic || folio == last_folio) { - set_fsync_mark(&folio->page, 1); + set_fsync_mark(folio, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) 
f2fs_update_inode(inode, folio); - set_dentry_mark(&folio->page, + set_dentry_mark(folio, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ @@ -1935,7 +1967,7 @@ static bool flush_dirty_inode(struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(&folio->page); + nid_t ino = ino_of_node(folio); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) @@ -1964,7 +1996,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(&folio->page)) + if (!IS_INODE(folio)) continue; folio_lock(folio); @@ -1975,10 +2007,10 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(&folio->page)) { - clear_page_private_inline(&folio->page); + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); folio_unlock(folio); - flush_inline_data(sbi, ino_of_node(&folio->page)); + flush_inline_data(sbi, ino_of_node(folio)); continue; } unlock: @@ -2027,13 +2059,13 @@ next_step: * 1. dentry dnodes * 2. file dnodes */ - if (step == 0 && IS_DNODE(&folio->page)) + if (step == 0 && IS_DNODE(folio)) continue; - if (step == 1 && (!IS_DNODE(&folio->page) || - is_cold_node(&folio->page))) + if (step == 1 && (!IS_DNODE(folio) || + is_cold_node(folio))) continue; - if (step == 2 && (!IS_DNODE(&folio->page) || - !is_cold_node(&folio->page))) + if (step == 2 && (!IS_DNODE(folio) || + !is_cold_node(folio))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) @@ -2057,15 +2089,15 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(&folio->page)) { - clear_page_private_inline(&folio->page); + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); folio_unlock(folio); - flush_inline_data(sbi, ino_of_node(&folio->page)); + flush_inline_data(sbi, ino_of_node(folio)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) + if (IS_INODE(folio) && flush_dirty_inode(folio)) goto lock_node; write_node: f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -2073,8 +2105,8 @@ write_node: if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(&folio->page, 0); - set_dentry_mark(&folio->page, 0); + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { @@ -2201,12 +2233,12 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS - if (IS_INODE(&folio->page)) - f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); + if (IS_INODE(folio)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio); #endif if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); - set_page_private_reference(&folio->page); + folio_set_f2fs_reference(folio); return true; } return false; @@ -2351,7 +2383,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * - __remove_nid_from_list(PREALLOC_NID) * - __insert_nid_to_list(FREE_NID) */ - ne = __lookup_nat_cache(nm_i, nid); + ne = __lookup_nat_cache(nm_i, nid, false); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) goto err_out; @@ -2714,7 +2746,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio) if (IS_ERR(ifolio)) return 
PTR_ERR(ifolio); - ri = F2FS_INODE(&folio->page); + ri = F2FS_INODE(folio); if (ri->i_inline & F2FS_INLINE_XATTR) { if (!f2fs_has_inline_xattr(inode)) { set_inode_flag(inode, FI_INLINE_XATTR); @@ -2740,7 +2772,7 @@ update_inode: return 0; } -int f2fs_recover_xattr_data(struct inode *inode, struct page *page) +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2778,8 +2810,8 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - if (page) { - memcpy(F2FS_NODE(&xfolio->page), F2FS_NODE(page), + if (folio) { + memcpy(F2FS_NODE(xfolio), F2FS_NODE(folio), VALID_XATTR_BLOCK_SIZE); folio_mark_dirty(xfolio); } @@ -2788,10 +2820,10 @@ recover_xnid: return 0; } -int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio) { struct f2fs_inode *src, *dst; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(folio); struct node_info old_ni, new_ni; struct folio *ifolio; int err; @@ -2814,11 +2846,11 @@ retry: if (!folio_test_uptodate(ifolio)) folio_mark_uptodate(ifolio); - fill_node_footer(&ifolio->page, ino, ino, 0, true); - set_cold_node(&ifolio->page, false); + fill_node_footer(ifolio, ino, ino, 0, true); + set_cold_node(ifolio, false); - src = F2FS_INODE(page); - dst = F2FS_INODE(&ifolio->page); + src = F2FS_INODE(folio); + dst = F2FS_INODE(ifolio); memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; @@ -2884,7 +2916,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, if (IS_ERR(folio)) return PTR_ERR(folio); - rn = F2FS_NODE(&folio->page); + rn = F2FS_NODE(folio); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; @@ -2904,6 +2936,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; int i; + bool init_dirty; down_write(&curseg->journal_rwsem); for (i = 0; i < nats_in_cursum(journal); i++) { @@ -2914,12 +2947,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) if (f2fs_check_nid_range(sbi, nid)) continue; + init_dirty = false; + raw_ne = nat_in_journal(journal, i); - ne = __lookup_nat_cache(nm_i, nid); + ne = __lookup_nat_cache(nm_i, nid, true); if (!ne) { + init_dirty = true; ne = __alloc_nat_entry(sbi, nid, true); - __init_nat_entry(nm_i, ne, &raw_ne, true); + __init_nat_entry(nm_i, ne, &raw_ne, true, true); } /* @@ -2934,7 +2970,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->nid_list_lock); } - __set_nat_cache_dirty(nm_i, ne); + __set_nat_cache_dirty(nm_i, ne, init_dirty); } update_nats_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); @@ -2959,11 +2995,10 @@ add_out: } static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, - struct page *page) + const struct f2fs_nat_block *nat_blk) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; - struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; int i = 0; @@ -3000,7 +3035,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; - struct page *page = NULL; + struct folio *folio = NULL; /* * there are two steps to flush nat entries: @@ -3014,11 +3049,11 @@ static int __flush_nat_entry_set(struct 
f2fs_sb_info *sbi, if (to_journal) { down_write(&curseg->journal_rwsem); } else { - page = get_next_nat_page(sbi, start_nid); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = get_next_nat_folio(sbi, start_nid); + if (IS_ERR(folio)) + return PTR_ERR(folio); - nat_blk = page_address(page); + nat_blk = folio_address(folio); f2fs_bug_on(sbi, !nat_blk); } @@ -3054,8 +3089,8 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); - f2fs_put_page(page, 1); + __update_nat_bits(sbi, start_nid, nat_blk); + f2fs_folio_put(folio, true); } /* Allow dirty nats by node block allocation in write_begin */ @@ -3395,10 +3430,10 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) } kvfree(nm_i->free_nid_count); - kvfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bitmap); kvfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS - kvfree(nm_i->nat_bitmap_mir); + kfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1446c433b3ec..030390543b54 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -31,7 +31,7 @@ /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 -/* control total # of node writes used for roll-fowrad recovery */ +/* control total # of node writes used for roll-forward recovery */ #define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ @@ -243,41 +243,41 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) #endif } -static inline nid_t ino_of_node(struct page *node_page) +static inline nid_t ino_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.ino); } -static inline nid_t nid_of_node(struct page *node_page) +static inline nid_t nid_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(const struct page *node_page) +static inline unsigned int ofs_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } -static inline __u64 cpver_of_node(struct page *node_page) +static inline __u64 cpver_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le64_to_cpu(rn->footer.cp_ver); } -static inline block_t next_blkaddr_of_node(struct folio *node_folio) +static inline block_t next_blkaddr_of_node(const struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(&node_folio->page); + struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.next_blkaddr); } -static inline void fill_node_footer(struct page *page, nid_t nid, +static inline void fill_node_footer(const struct folio *folio, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int old_flag = 0; if (reset) @@ -293,17 +293,18 @@ static inline void fill_node_footer(struct page *page, nid_t nid, (old_flag & OFFSET_BIT_MASK)); } -static inline void copy_node_footer(struct page *dst, struct page *src) +static inline void copy_node_footer(const struct 
folio *dst, + const struct folio *src) { struct f2fs_node *src_rn = F2FS_NODE(src); struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } -static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr) { - struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); + struct f2fs_node *rn = F2FS_NODE(folio); __u64 cp_ver = cur_cp_version(ckpt); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) @@ -313,19 +314,19 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline bool is_recoverable_dnode(struct page *page) +static inline bool is_recoverable_dnode(const struct folio *folio) { - struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); __u64 cp_ver = cur_cp_version(ckpt); /* Don't care crc part, if fsck.f2fs sets it. */ if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) - return (cp_ver << 32) == (cpver_of_node(page) << 32); + return (cp_ver << 32) == (cpver_of_node(folio) << 32); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); - return cp_ver == cpver_of_node(page); + return cp_ver == cpver_of_node(folio); } /* @@ -349,9 +350,9 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(const struct page *node_page) +static inline bool IS_DNODE(const struct folio *node_folio) { - unsigned int ofs = ofs_of_node(node_page); + unsigned int ofs = ofs_of_node(node_folio); if (f2fs_has_xattr_block(ofs)) return true; @@ -369,7 +370,7 @@ static inline bool IS_DNODE(const struct page *node_page) static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) { - struct f2fs_node *rn = F2FS_NODE(&folio->page); + struct f2fs_node *rn = F2FS_NODE(folio); f2fs_folio_wait_writeback(folio, NODE, true, true); @@ -380,9 +381,9 @@ static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) return folio_mark_dirty(folio); } -static inline nid_t get_nid(struct page *p, int off, bool i) +static inline nid_t get_nid(const struct folio *folio, int off, bool i) { - struct f2fs_node *rn = F2FS_NODE(p); + struct f2fs_node *rn = F2FS_NODE(folio); if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); @@ -396,19 +397,19 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int is_node(const struct page *page, int type) +static inline int is_node(const struct folio *folio, int type) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); return le32_to_cpu(rn->footer.flag) & BIT(type); } -#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) -#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) -#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +#define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT) +#define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT) +#define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT) -static inline void set_cold_node(struct page *page, bool is_dir) +static inline void set_cold_node(const struct folio *folio, bool is_dir) { - struct f2fs_node *rn = F2FS_NODE(page); + struct 
f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) @@ -418,9 +419,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) rn->footer.flag = cpu_to_le32(flag); } -static inline void set_mark(struct page *page, int mark, int type) +static inline void set_mark(struct folio *folio, int mark, int type) { - struct f2fs_node *rn = F2FS_NODE(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= BIT(type); @@ -429,8 +430,8 @@ static inline void set_mark(struct page *page, int mark, int type) rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_P_SB(page), page); + f2fs_inode_chksum_set(F2FS_F_SB(folio), folio); #endif } -#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) -#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) +#define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 51ebed4e1521..4cb3a91801b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -157,10 +157,10 @@ static int init_recovered_filename(const struct inode *dir, return 0; } -static int recover_dentry(struct inode *inode, struct page *ipage, +static int recover_dentry(struct inode *inode, struct folio *ifolio, struct list_head *dir_list) { - struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + struct f2fs_inode *raw_inode = F2FS_INODE(ifolio); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct f2fs_filename fname; @@ -233,14 +233,14 @@ out: else name = raw_inode->i_name; f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), name, + __func__, ino_of_node(ifolio), name, IS_ERR(dir) ? 
0 : dir->i_ino, err); return err; } -static int recover_quota_data(struct inode *inode, struct page *page) +static int recover_quota_data(struct inode *inode, struct folio *folio) { - struct f2fs_inode *raw = F2FS_INODE(page); + struct f2fs_inode *raw = F2FS_INODE(folio); struct iattr attr; uid_t i_uid = le32_to_cpu(raw->i_uid); gid_t i_gid = le32_to_cpu(raw->i_gid); @@ -277,16 +277,16 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) clear_inode_flag(inode, FI_DATA_EXIST); } -static int recover_inode(struct inode *inode, struct page *page) +static int recover_inode(struct inode *inode, struct folio *folio) { - struct f2fs_inode *raw = F2FS_INODE(page); + struct f2fs_inode *raw = F2FS_INODE(folio); struct f2fs_inode_info *fi = F2FS_I(inode); char *name; int err; inode->i_mode = le16_to_cpu(raw->i_mode); - err = recover_quota_data(inode, page); + err = recover_quota_data(inode, folio); if (err) return err; @@ -333,10 +333,10 @@ static int recover_inode(struct inode *inode, struct page *page) if (file_enc_name(inode)) name = "<encrypted>"; else - name = F2FS_INODE(page)->i_name; + name = F2FS_INODE(folio)->i_name; f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x", - ino_of_node(page), name, raw->i_inline); + ino_of_node(folio), name, raw->i_inline); return 0; } @@ -375,7 +375,7 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, if (IS_ERR(folio)) return PTR_ERR(folio); - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); *is_detecting = false; return 0; @@ -424,22 +424,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, break; } - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } - if (!is_fsync_dnode(&folio->page)) + if (!is_fsync_dnode(folio)) goto next; - entry = get_fsync_inode(head, ino_of_node(&folio->page)); + entry = get_fsync_inode(head, ino_of_node(folio)); if (!entry) { bool quota_inode = false; if (!check_only && - IS_INODE(&folio->page) && - is_dent_dnode(&folio->page)) { - err = f2fs_recover_inode_page(sbi, &folio->page); + IS_INODE(folio) && + is_dent_dnode(folio)) { + err = f2fs_recover_inode_page(sbi, folio); if (err) { f2fs_folio_put(folio, true); break; @@ -451,7 +451,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
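
For context, find_fsync_dnodes() is walking the chain of warm node blocks written since the last checkpoint. Its shape under the folio API is roughly as follows (error handling and readahead elided; f2fs_get_tmp_folio() is assumed here to be the folio counterpart of the old f2fs_get_tmp_page()):

	while (1) {
		struct folio *folio = f2fs_get_tmp_folio(sbi, blkaddr);

		if (!is_recoverable_dnode(folio))	/* cp_ver mismatch: end of chain */
			break;
		if (is_fsync_dnode(folio))
			add_fsync_inode(sbi, head, ino_of_node(folio), quota_inode);
		blkaddr = next_blkaddr_of_node(folio);	/* follow the chain link */
		f2fs_folio_put(folio, true);
	}
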
*/ - entry = add_fsync_inode(sbi, head, ino_of_node(&folio->page), + entry = add_fsync_inode(sbi, head, ino_of_node(folio), quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); @@ -463,7 +463,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, } entry->blkaddr = blkaddr; - if (IS_INODE(&folio->page) && is_dent_dnode(&folio->page)) + if (IS_INODE(folio) && is_dent_dnode(folio)) entry->last_dentry = blkaddr; next: /* check next segment */ @@ -527,7 +527,7 @@ got_it: nid = le32_to_cpu(sum.nid); ofs_in_node = le16_to_cpu(sum.ofs_in_node); - max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode); + max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); @@ -552,8 +552,8 @@ got_it: if (IS_ERR(node_folio)) return PTR_ERR(node_folio); - offset = ofs_of_node(&node_folio->page); - ino = ino_of_node(&node_folio->page); + offset = ofs_of_node(node_folio); + ino = ino_of_node(node_folio); f2fs_folio_put(node_folio, true); if (ino != dn->inode->i_ino) { @@ -624,16 +624,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, { struct dnode_of_data dn; struct node_info ni; - unsigned int start, end; + unsigned int start = 0, end = 0, index; int err = 0, recovered = 0; /* step 1: recover xattr */ - if (IS_INODE(&folio->page)) { + if (IS_INODE(folio)) { err = f2fs_recover_inline_xattr(inode, folio); if (err) goto out; - } else if (f2fs_has_xattr_block(ofs_of_node(&folio->page))) { - err = f2fs_recover_xattr_data(inode, &folio->page); + } else if (f2fs_has_xattr_block(ofs_of_node(folio))) { + err = f2fs_recover_xattr_data(inode, folio); if (!err) recovered++; goto out; @@ -648,8 +648,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* step 3: recover data indices */ - start = f2fs_start_bidx_of_node(ofs_of_node(&folio->page), inode); - end = start + ADDRS_PER_PAGE(&folio->page, inode); + start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode); + end = start + ADDRS_PER_PAGE(folio, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: @@ -668,18 +668,18 @@ retry_dn: if (err) goto err; - f2fs_bug_on(sbi, ni.ino != ino_of_node(&folio->page)); + f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); - if (ofs_of_node(&dn.node_folio->page) != ofs_of_node(&folio->page)) { + if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", - inode->i_ino, ofs_of_node(&dn.node_folio->page), - ofs_of_node(&folio->page)); + inode->i_ino, ofs_of_node(dn.node_folio), + ofs_of_node(folio)); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } - for (; start < end; start++, dn.ofs_in_node++) { + for (index = start; index < end; index++, dn.ofs_in_node++) { block_t src, dest; src = f2fs_data_blkaddr(&dn); @@ -708,9 +708,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT))) f2fs_i_size_write(inode, - (loff_t)(start + 1) << PAGE_SHIFT); + (loff_t)(index + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -758,16 +758,18 @@ retry_prev: } } - copy_node_footer(&dn.node_folio->page, &folio->page); - fill_node_footer(&dn.node_folio->page, dn.nid, ni.ino, - ofs_of_node(&folio->page), false); + copy_node_footer(dn.node_folio, folio); + 
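
Note that the data-index loop above now advances a scratch variable so that start and end survive the walk; they feed the extended recovery summary message below. A minimal before/after of just the loop header:

	/* before: the walk consumed 'start', losing the range for logging */
	for (; start < end; start++, dn.ofs_in_node++)

	/* after: 'index' walks, (start, end) stay intact for f2fs_notice() */
	for (index = start; index < end; index++, dn.ofs_in_node++)
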
fill_node_footer(dn.node_folio, dn.nid, ni.ino, + ofs_of_node(folio), false); folio_mark_dirty(dn.node_folio); err: f2fs_put_dnode(&dn); out: - f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", - inode->i_ino, file_keep_isize(inode) ? "keep" : "recover", - recovered, err); + f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " + "range (%u, %u), recovered = %d, err = %d", + inode->i_ino, nid_of_node(folio), + file_keep_isize(inode) ? "keep" : "recover", + start, end, recovered, err); return err; } @@ -778,6 +780,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, int err = 0; block_t blkaddr; unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + unsigned int recoverable_dnode = 0; + unsigned int fsynced_dnode = 0; + unsigned int total_dnode = 0; + unsigned int recovered_inode = 0; + unsigned int recovered_dentry = 0; + unsigned int recovered_dnode = 0; + + f2fs_notice(sbi, "do_recover_data: start to recover dnode"); /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -796,38 +806,43 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, break; } - if (!is_recoverable_dnode(&folio->page)) { + if (!is_recoverable_dnode(folio)) { f2fs_folio_put(folio, true); break; } + recoverable_dnode++; - entry = get_fsync_inode(inode_list, ino_of_node(&folio->page)); + entry = get_fsync_inode(inode_list, ino_of_node(folio)); if (!entry) goto next; + fsynced_dnode++; /* * inode(x) | CP | inode(x) | dnode(F) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (IS_INODE(&folio->page)) { - err = recover_inode(entry->inode, &folio->page); + if (IS_INODE(folio)) { + err = recover_inode(entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } + recovered_inode++; } if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, &folio->page, dir_list); + err = recover_dentry(entry->inode, folio, dir_list); if (err) { f2fs_folio_put(folio, true); break; } + recovered_dentry++; } err = do_recover_data(sbi, entry->inode, folio); if (err) { f2fs_folio_put(folio, true); break; } + recovered_dnode++; if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); @@ -840,9 +855,15 @@ next: f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + total_dnode++; } if (!err) err = f2fs_allocate_new_segments(sbi); + + f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, " + "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d", + recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode, + recovered_dentry, recovered_dnode, err); return err; } @@ -855,6 +876,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, " + "check_only: %d", check_only); + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae1223ef648f..cc82d42ef14c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode) goto next; } - blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode), + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { @@ 
-455,7 +455,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } else { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .init_gc_type = BG_GC, + .init_gc_type = f2fs_sb_has_blkzoned(sbi) ? + FG_GC : BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, @@ -772,7 +773,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) @@ -799,7 +800,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, !valid_blocks) || valid_blocks == CAP_BLKS_PER_SEC(sbi)); - if (!IS_CURSEC(sbi, secno)) + if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } @@ -838,7 +839,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, return; } - if (!IS_CURSEC(sbi, secno)) + if (!is_cursec(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } @@ -855,7 +856,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; - if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + if (segno == NULL_SEGNO || is_curseg(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); @@ -888,7 +889,7 @@ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); @@ -2107,7 +2108,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!f2fs_realtime_discard_enable(sbi) || (!se->valid_blocks && - !IS_CURSEG(sbi, cpc->trim_start)) || + !is_curseg(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -2235,7 +2236,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); - if (!IS_CURSEC(sbi, secno) && + if (!is_cursec(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); @@ -3619,7 +3620,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(fio->page) && is_cold_node(fio->page)) + if (IS_DNODE(fio->folio) && is_cold_node(fio->folio)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; @@ -3665,8 +3666,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; - type = __get_age_segment_type(inode, - page_folio(fio->page)->index); + type = __get_age_segment_type(inode, fio->folio->index); if (type != NO_CHECK_TYPE) return type; @@ -3677,8 +3677,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); } else { - if (IS_DNODE(fio->page)) - return is_cold_node(fio->page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->folio)) + return is_cold_node(fio->folio) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } @@ -3746,7 +3746,7 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) @@ -3850,10 +3850,10 @@ skip_new_segment: up_write(&sit_i->sentry_lock); - if (page && IS_NODESEG(curseg->seg_type)) { - fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (folio && IS_NODESEG(curseg->seg_type)) { + fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg)); - f2fs_inode_chksum_set(sbi, page); + f2fs_inode_chksum_set(sbi, folio); } if (fio) { @@ -3931,7 +3931,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - struct folio *folio = page_folio(fio->page); + struct folio *folio = fio->folio; enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3940,15 +3940,21 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, folio); + f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi, + CP_ERROR_FLAG)); goto out; } + + f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi, + fio->new_blkaddr, DATA_GENERIC_ENHANCE)); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); @@ -3972,7 +3978,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = folio->index, .new_blkaddr = folio->index, - .page = folio_page(folio, 0), + .folio = folio, .encrypted_page = NULL, .in_list = 0, }; @@ -4100,14 +4106,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!recover_curseg) { /* for recovery flow */ - if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { - if (IS_CURSEG(sbi, segno)) { + if (is_curseg(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); @@ -4191,7 +4197,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { @@ -5143,7 +5149,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; - if (IS_CURSEC(sbi, secno)) + if (is_cursec(sbi, secno)) continue; set_bit(secno, 
dirty_i->dirty_secmap); } @@ -5279,7 +5285,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, * Get # of valid block of the zone. */ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); - if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); @@ -5806,9 +5812,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; - kvfree(sit_i->sit_bitmap); + kfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS - kvfree(sit_i->sit_bitmap_mir); + kfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif kfree(sit_i); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index db619fd2f51a..5e2ee5c686b1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -34,34 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } -#define IS_CURSEG(sbi, seg) \ - (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) - -#define IS_CURSEC(sbi, secno) \ - (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ - SEGS_PER_SEC(sbi)) || \ - ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ - SEGS_PER_SEC(sbi))) - #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? 
SM_I(sbi)->main_blkaddr : \
 		le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr))
@@ -318,6 +290,28 @@ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
 	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
 }
 
+static inline bool is_curseg(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	int i;
+
+	for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+		if (segno == CURSEG_I(sbi, i)->segno)
+			return true;
+	}
+	return false;
+}
+
+static inline bool is_cursec(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+	int i;
+
+	for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+		if (secno == GET_SEC_FROM_SEG(sbi, CURSEG_I(sbi, i)->segno))
+			return true;
+	}
+	return false;
+}
+
 static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
 					unsigned int segno)
 {
@@ -509,7 +503,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
 
 	free_i->free_segments++;
 
-	if (!inmem && IS_CURSEC(sbi, secno))
+	if (!inmem && is_cursec(sbi, secno))
 		goto unlock_out;
 
 	/* check large section */
@@ -674,8 +668,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
 	unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
 	unsigned int data_blocks = 0;
 
-	if (f2fs_lfs_mode(sbi) &&
-	    unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+	if (f2fs_lfs_mode(sbi)) {
 		total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA);
 		data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi);
 		data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi);
@@ -684,7 +677,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
 	if (lower_p)
 		*lower_p = node_secs + dent_secs + data_secs;
 	if (upper_p)
-		*upper_p = node_secs + dent_secs +
+		*upper_p = node_secs + dent_secs + data_secs +
 			   (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) +
 			   (data_blocks ? 
1 : 0); if (curseg_p) @@ -986,7 +979,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { - if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + if (is_cursec(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bbf1dad6843f..e16c4e2830c2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -27,6 +27,8 @@ #include <linux/part_stat.h> #include <linux/zstd.h> #include <linux/lz4.h> +#include <linux/ctype.h> +#include <linux/fs_parser.h> #include "f2fs.h" #include "node.h" @@ -125,29 +127,20 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, - Opt_nodiscard, Opt_noheap, Opt_heap, Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, - Opt_noinline_xattr, Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, - Opt_noinline_dentry, Opt_flush_merge, - Opt_noflush_merge, Opt_barrier, - Opt_nobarrier, Opt_fastboot, Opt_extent_cache, - Opt_noextent_cache, - Opt_noinline_data, Opt_data_flush, Opt_reserve_root, Opt_resgid, @@ -156,21 +149,13 @@ enum { Opt_fault_injection, Opt_fault_type, Opt_lazytime, - Opt_nolazytime, Opt_quota, - Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_usrjquota, Opt_grpjquota, Opt_prjjquota, - Opt_offusrjquota, - Opt_offgrpjquota, - Opt_offprjjquota, - Opt_jqfmt_vfsold, - Opt_jqfmt_vfsv0, - Opt_jqfmt_vfsv1, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -180,107 +165,209 @@ enum { Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, Opt_checkpoint_merge, - Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, - Opt_compress_extension, Opt_nocompress_extension, + Opt_compress_extension, Opt_compress_chksum, Opt_compress_mode, Opt_compress_cache, Opt_atgc, Opt_gc_merge, - Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, Opt_age_extent_cache, Opt_errors, Opt_nat_bits, + Opt_jqfmt, + Opt_checkpoint, Opt_err, }; -static match_table_t f2fs_tokens = { - {Opt_gc_background, "background_gc=%s"}, - {Opt_disable_roll_forward, "disable_roll_forward"}, - {Opt_norecovery, "norecovery"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_noheap, "no_heap"}, - {Opt_heap, "heap"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_active_logs, "active_logs=%u"}, - {Opt_disable_ext_identify, "disable_ext_identify"}, - {Opt_inline_xattr, "inline_xattr"}, - {Opt_noinline_xattr, "noinline_xattr"}, - {Opt_inline_xattr_size, "inline_xattr_size=%u"}, - {Opt_inline_data, "inline_data"}, - {Opt_inline_dentry, "inline_dentry"}, - {Opt_noinline_dentry, "noinline_dentry"}, - {Opt_flush_merge, "flush_merge"}, - {Opt_noflush_merge, "noflush_merge"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_fastboot, "fastboot"}, - {Opt_extent_cache, "extent_cache"}, - {Opt_noextent_cache, "noextent_cache"}, - {Opt_noinline_data, "noinline_data"}, - {Opt_data_flush, "data_flush"}, - {Opt_reserve_root, "reserve_root=%u"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_mode, "mode=%s"}, - {Opt_fault_injection, "fault_injection=%u"}, - {Opt_fault_type, "fault_type=%u"}, - {Opt_lazytime, "lazytime"}, - {Opt_nolazytime, "nolazytime"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_prjquota, "prjquota"}, - 
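
From this point the hand-rolled match_table_t parser is replaced by the generic mount API: one fs_parameter_spec entry per option in the table that follows, with fsparam_flag_no() collapsing each opt/noopt pair (discard/nodiscard, acl/noacl, ...) into a single entry. A hedged sketch of how such a table is consumed — fs_parse() and struct fs_parse_result come from <linux/fs_parser.h>, and the real f2fs handler covers every option, not just this one:

	static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
	{
		struct f2fs_fs_context *ctx = fc->fs_private;
		struct fs_parse_result result;
		int opt;

		opt = fs_parse(fc, f2fs_param_specs, param, &result);
		if (opt < 0)
			return opt;

		switch (opt) {
		case Opt_discard:
			/* fsparam_flag_no: result.negated is set for "nodiscard" */
			if (result.negated)
				ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD);
			else
				ctx_set_opt(ctx, F2FS_MOUNT_DISCARD);
			return 0;
		/* ... remaining options ... */
		}
		return -EINVAL;
	}
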
{Opt_usrjquota, "usrjquota=%s"}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_prjjquota, "prjjquota=%s"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_offprjjquota, "prjjquota="}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_alloc, "alloc_mode=%s"}, - {Opt_fsync, "fsync_mode=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption"}, - {Opt_inlinecrypt, "inlinecrypt"}, - {Opt_checkpoint_disable, "checkpoint=disable"}, - {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, - {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, - {Opt_checkpoint_enable, "checkpoint=enable"}, - {Opt_checkpoint_merge, "checkpoint_merge"}, - {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, - {Opt_compress_algorithm, "compress_algorithm=%s"}, - {Opt_compress_log_size, "compress_log_size=%u"}, - {Opt_compress_extension, "compress_extension=%s"}, - {Opt_nocompress_extension, "nocompress_extension=%s"}, - {Opt_compress_chksum, "compress_chksum"}, - {Opt_compress_mode, "compress_mode=%s"}, - {Opt_compress_cache, "compress_cache"}, - {Opt_atgc, "atgc"}, - {Opt_gc_merge, "gc_merge"}, - {Opt_nogc_merge, "nogc_merge"}, - {Opt_discard_unit, "discard_unit=%s"}, - {Opt_memory_mode, "memory=%s"}, - {Opt_age_extent_cache, "age_extent_cache"}, - {Opt_errors, "errors=%s"}, - {Opt_nat_bits, "nat_bits"}, +static const struct constant_table f2fs_param_background_gc[] = { + {"on", BGGC_MODE_ON}, + {"off", BGGC_MODE_OFF}, + {"sync", BGGC_MODE_SYNC}, + {} +}; + +static const struct constant_table f2fs_param_mode[] = { + {"adaptive", FS_MODE_ADAPTIVE}, + {"lfs", FS_MODE_LFS}, + {"fragment:segment", FS_MODE_FRAGMENT_SEG}, + {"fragment:block", FS_MODE_FRAGMENT_BLK}, + {} +}; + +static const struct constant_table f2fs_param_jqfmt[] = { + {"vfsold", QFMT_VFS_OLD}, + {"vfsv0", QFMT_VFS_V0}, + {"vfsv1", QFMT_VFS_V1}, + {} +}; + +static const struct constant_table f2fs_param_alloc_mode[] = { + {"default", ALLOC_MODE_DEFAULT}, + {"reuse", ALLOC_MODE_REUSE}, + {} +}; +static const struct constant_table f2fs_param_fsync_mode[] = { + {"posix", FSYNC_MODE_POSIX}, + {"strict", FSYNC_MODE_STRICT}, + {"nobarrier", FSYNC_MODE_NOBARRIER}, + {} +}; + +static const struct constant_table f2fs_param_compress_mode[] = { + {"fs", COMPR_MODE_FS}, + {"user", COMPR_MODE_USER}, + {} +}; + +static const struct constant_table f2fs_param_discard_unit[] = { + {"block", DISCARD_UNIT_BLOCK}, + {"segment", DISCARD_UNIT_SEGMENT}, + {"section", DISCARD_UNIT_SECTION}, + {} +}; + +static const struct constant_table f2fs_param_memory_mode[] = { + {"normal", MEMORY_MODE_NORMAL}, + {"low", MEMORY_MODE_LOW}, + {} +}; + +static const struct constant_table f2fs_param_errors[] = { + {"remount-ro", MOUNT_ERRORS_READONLY}, + {"continue", MOUNT_ERRORS_CONTINUE}, + {"panic", MOUNT_ERRORS_PANIC}, + {} +}; + +static const struct fs_parameter_spec f2fs_param_specs[] = { + fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), + fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag("no_heap", Opt_noheap), + fsparam_flag("heap", Opt_heap), + fsparam_flag_no("user_xattr", Opt_user_xattr), + fsparam_flag_no("acl", Opt_acl), + fsparam_s32("active_logs", Opt_active_logs), + fsparam_flag("disable_ext_identify", Opt_disable_ext_identify), + fsparam_flag_no("inline_xattr", 
Opt_inline_xattr), + fsparam_s32("inline_xattr_size", Opt_inline_xattr_size), + fsparam_flag_no("inline_data", Opt_inline_data), + fsparam_flag_no("inline_dentry", Opt_inline_dentry), + fsparam_flag_no("flush_merge", Opt_flush_merge), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("fastboot", Opt_fastboot), + fsparam_flag_no("extent_cache", Opt_extent_cache), + fsparam_flag("data_flush", Opt_data_flush), + fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_gid("resgid", Opt_resgid), + fsparam_uid("resuid", Opt_resuid), + fsparam_enum("mode", Opt_mode, f2fs_param_mode), + fsparam_s32("fault_injection", Opt_fault_injection), + fsparam_u32("fault_type", Opt_fault_type), + fsparam_flag_no("lazytime", Opt_lazytime), + fsparam_flag_no("quota", Opt_quota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + fsparam_flag("prjquota", Opt_prjquota), + fsparam_string_empty("usrjquota", Opt_usrjquota), + fsparam_string_empty("grpjquota", Opt_grpjquota), + fsparam_string_empty("prjjquota", Opt_prjjquota), + fsparam_flag("nat_bits", Opt_nat_bits), + fsparam_enum("jqfmt", Opt_jqfmt, f2fs_param_jqfmt), + fsparam_enum("alloc_mode", Opt_alloc, f2fs_param_alloc_mode), + fsparam_enum("fsync_mode", Opt_fsync, f2fs_param_fsync_mode), + fsparam_string("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("inlinecrypt", Opt_inlinecrypt), + fsparam_string("checkpoint", Opt_checkpoint), + fsparam_flag_no("checkpoint_merge", Opt_checkpoint_merge), + fsparam_string("compress_algorithm", Opt_compress_algorithm), + fsparam_u32("compress_log_size", Opt_compress_log_size), + fsparam_string("compress_extension", Opt_compress_extension), + fsparam_string("nocompress_extension", Opt_nocompress_extension), + fsparam_flag("compress_chksum", Opt_compress_chksum), + fsparam_enum("compress_mode", Opt_compress_mode, f2fs_param_compress_mode), + fsparam_flag("compress_cache", Opt_compress_cache), + fsparam_flag("atgc", Opt_atgc), + fsparam_flag_no("gc_merge", Opt_gc_merge), + fsparam_enum("discard_unit", Opt_discard_unit, f2fs_param_discard_unit), + fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), + fsparam_flag("age_extent_cache", Opt_age_extent_cache), + fsparam_enum("errors", Opt_errors, f2fs_param_errors), + {} +}; + +/* Resort to a match_table for this interestingly formatted option */ +static match_table_t f2fs_checkpoint_tokens = { + {Opt_checkpoint_disable, "disable"}, + {Opt_checkpoint_disable_cap, "disable:%u"}, + {Opt_checkpoint_disable_cap_perc, "disable:%u%%"}, + {Opt_checkpoint_enable, "enable"}, {Opt_err, NULL}, }; +#define F2FS_SPEC_background_gc (1 << 0) +#define F2FS_SPEC_inline_xattr_size (1 << 1) +#define F2FS_SPEC_active_logs (1 << 2) +#define F2FS_SPEC_reserve_root (1 << 3) +#define F2FS_SPEC_resgid (1 << 4) +#define F2FS_SPEC_resuid (1 << 5) +#define F2FS_SPEC_mode (1 << 6) +#define F2FS_SPEC_fault_injection (1 << 7) +#define F2FS_SPEC_fault_type (1 << 8) +#define F2FS_SPEC_jqfmt (1 << 9) +#define F2FS_SPEC_alloc_mode (1 << 10) +#define F2FS_SPEC_fsync_mode (1 << 11) +#define F2FS_SPEC_checkpoint_disable_cap (1 << 12) +#define F2FS_SPEC_checkpoint_disable_cap_perc (1 << 13) +#define F2FS_SPEC_compress_level (1 << 14) +#define F2FS_SPEC_compress_algorithm (1 << 15) +#define F2FS_SPEC_compress_log_size (1 << 16) +#define F2FS_SPEC_compress_extension (1 << 17) +#define F2FS_SPEC_nocompress_extension (1 << 18) +#define F2FS_SPEC_compress_chksum (1 << 19) +#define 
F2FS_SPEC_compress_mode (1 << 20) +#define F2FS_SPEC_discard_unit (1 << 21) +#define F2FS_SPEC_memory_mode (1 << 22) +#define F2FS_SPEC_errors (1 << 23) + +struct f2fs_fs_context { + struct f2fs_mount_info info; + unsigned int opt_mask; /* Bits changed */ + unsigned int spec_mask; + unsigned short qname_mask; +}; + +#define F2FS_CTX_INFO(ctx) ((ctx)->info) + +static inline void ctx_set_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + ctx->info.opt |= flag; + ctx->opt_mask |= flag; +} + +static inline void ctx_clear_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + ctx->info.opt &= ~flag; + ctx->opt_mask |= flag; +} + +static inline bool ctx_test_opt(struct f2fs_fs_context *ctx, + unsigned int flag) +{ + return ctx->info.opt & flag; +} + void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, - const char *fmt, ...) + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -292,11 +379,19 @@ void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (limit_rate) - printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (sbi) + printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk_ratelimited("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); else - printk("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (sbi) + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); va_end(args); } @@ -390,159 +485,90 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, - substring_t *args) +/* + * Note the name of the specified quota file. 
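+ * An empty value (e.g. "usrjquota=") clears a previously noted name
+ * instead; f2fs_parse_param() routes that case to f2fs_unnote_qf_name().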
+ */ +static int f2fs_note_qf_name(struct fs_context *fc, int qtype, + struct fs_parameter *param) { - struct super_block *sb = sbi->sb; + struct f2fs_fs_context *ctx = fc->fs_private; char *qname; - int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { - f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + if (param->size < 1) { + f2fs_err(NULL, "Missing quota name"); return -EINVAL; } - if (f2fs_sb_has_quota_ino(sbi)) { - f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + if (strchr(param->string, '/')) { + f2fs_err(NULL, "quotafile must be on filesystem root"); + return -EINVAL; + } + if (ctx->info.s_qf_names[qtype]) { + if (strcmp(ctx->info.s_qf_names[qtype], param->string) != 0) { + f2fs_err(NULL, "Quota file already specified"); + return -EINVAL; + } return 0; } - qname = match_strdup(args); + qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { - f2fs_err(sbi, "Not enough memory for storing quotafile name"); + f2fs_err(NULL, "Not enough memory for storing quotafile name"); return -ENOMEM; } - if (F2FS_OPTION(sbi).s_qf_names[qtype]) { - if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) - ret = 0; - else - f2fs_err(sbi, "%s quota file already specified", - QTYPE2NAME(qtype)); - goto errout; - } - if (strchr(qname, '/')) { - f2fs_err(sbi, "quotafile must be on filesystem root"); - goto errout; - } - F2FS_OPTION(sbi).s_qf_names[qtype] = qname; - set_opt(sbi, QUOTA); + F2FS_CTX_INFO(ctx).s_qf_names[qtype] = qname; + ctx->qname_mask |= 1 << qtype; return 0; -errout: - kfree(qname); - return ret; } -static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) +/* + * Clear the name of the specified quota file. + */ +static int f2fs_unnote_qf_name(struct fs_context *fc, int qtype) { - struct super_block *sb = sbi->sb; + struct f2fs_fs_context *ctx = fc->fs_private; - if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { - f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); - return -EINVAL; - } - kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); - F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; + kfree(ctx->info.s_qf_names[qtype]); + ctx->info.s_qf_names[qtype] = NULL; + ctx->qname_mask |= 1 << qtype; return 0; } -static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +static void f2fs_unnote_qf_name_all(struct fs_context *fc) { - /* - * We do the test below only for project quotas. 'usrquota' and - * 'grpquota' mount options are allowed even without quota feature - * to support legacy quotas in quota files. - */ - if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { - f2fs_err(sbi, "Project quota feature not enabled. 
Cannot enable project quota enforcement."); - return -1; - } - if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || - F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || - F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && - F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) - clear_opt(sbi, USRQUOTA); - - if (test_opt(sbi, GRPQUOTA) && - F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) - clear_opt(sbi, GRPQUOTA); - - if (test_opt(sbi, PRJQUOTA) && - F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) - clear_opt(sbi, PRJQUOTA); - - if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || - test_opt(sbi, PRJQUOTA)) { - f2fs_err(sbi, "old and new quota format mixing"); - return -1; - } - - if (!F2FS_OPTION(sbi).s_jquota_fmt) { - f2fs_err(sbi, "journaled quota format not specified"); - return -1; - } - } + int i; - if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { - f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); - F2FS_OPTION(sbi).s_jquota_fmt = 0; - } - return 0; + for (i = 0; i < MAXQUOTAS; i++) + f2fs_unnote_qf_name(fc, i); } #endif -static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, - const char *opt, - const substring_t *arg, - bool is_remount) +static int f2fs_parse_test_dummy_encryption(const struct fs_parameter *param, + struct f2fs_fs_context *ctx) { - struct fs_parameter param = { - .type = fs_value_is_string, - .string = arg->from ? arg->from : "", - }; - struct fscrypt_dummy_policy *policy = - &F2FS_OPTION(sbi).dummy_enc_policy; int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { - f2fs_warn(sbi, "test_dummy_encryption option not supported"); + f2fs_warn(NULL, "test_dummy_encryption option not supported"); return -EINVAL; } - - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. 
- */ - if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { - f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); - return -EINVAL; - } - - err = fscrypt_parse_test_dummy_encryption(¶m, policy); + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->info.dummy_enc_policy); if (err) { - if (err == -EEXIST) - f2fs_warn(sbi, - "Can't change test_dummy_encryption on remount"); - else if (err == -EINVAL) - f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", - opt); + if (err == -EINVAL) + f2fs_warn(NULL, "Value of option \"%s\" is unrecognized", + param->key); + else if (err == -EEXIST) + f2fs_warn(NULL, "Conflicting test_dummy_encryption options"); else - f2fs_warn(sbi, "Error processing option \"%s\" [%d]", - opt, err); + f2fs_warn(NULL, "Error processing option \"%s\" [%d]", + param->key, err); return -EINVAL; } - f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } #ifdef CONFIG_F2FS_FS_COMPRESSION -static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, +static bool is_compress_extension_exist(struct f2fs_mount_info *info, const char *new_ext, bool is_ext) { unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -550,11 +576,11 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, int i; if (is_ext) { - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + ext = info->extensions; + ext_cnt = info->compress_ext_cnt; } else { - ext = F2FS_OPTION(sbi).noextensions; - ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + ext = info->noextensions; + ext_cnt = info->nocompress_ext_cnt; } for (i = 0; i < ext_cnt; i++) { @@ -572,28 +598,28 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, * extension will be treated as special cases and will not be compressed. * 3. Don't allow the non-compress extension specifies all files. 
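* For example, nocompress_extension=* alone is rejected (rule 3), as is
* the same name passed to both compress_extension and nocompress_extension.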
*/ -static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +static int f2fs_test_compress_extension(unsigned char (*noext)[F2FS_EXTENSION_LEN], + int noext_cnt, + unsigned char (*ext)[F2FS_EXTENSION_LEN], + int ext_cnt) { - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned char (*noext)[F2FS_EXTENSION_LEN]; - int ext_cnt, noext_cnt, index = 0, no_index = 0; - - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - noext = F2FS_OPTION(sbi).noextensions; - noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int index = 0, no_index = 0; if (!noext_cnt) return 0; for (no_index = 0; no_index < noext_cnt; no_index++) { + if (strlen(noext[no_index]) == 0) + continue; if (!strcasecmp("*", noext[no_index])) { - f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + f2fs_info(NULL, "Don't allow the nocompress extension specifies all files"); return -EINVAL; } for (index = 0; index < ext_cnt; index++) { + if (strlen(ext[index]) == 0) + continue; if (!strcasecmp(ext[index], noext[no_index])) { - f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + f2fs_info(NULL, "Don't allow the same extension %s appear in both compress and nocompress extension", ext[index]); return -EINVAL; } @@ -603,58 +629,62 @@ static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) } #ifdef CONFIG_F2FS_FS_LZ4 -static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +static int f2fs_set_lz4hc_level(struct f2fs_fs_context *ctx, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } str += 3; if (str[0] != ':') { - f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>"); return -EINVAL; } if (kstrtouint(str + 1, 10, &level)) return -EINVAL; if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { - f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + f2fs_info(NULL, "invalid lz4hc compress level: %d", level); return -EINVAL; } - F2FS_OPTION(sbi).compress_level = level; + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; #else if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } - f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + f2fs_info(NULL, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif } #endif #ifdef CONFIG_F2FS_FS_ZSTD -static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +static int f2fs_set_zstd_level(struct f2fs_fs_context *ctx, const char *str) { int level; int len = 4; if (strlen(str) == len) { - F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + F2FS_CTX_INFO(ctx).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } str += len; if (str[0] != ':') { - f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + f2fs_info(NULL, "wrong format, e.g. 
<alg_name>:<compr_level>"); return -EINVAL; } if (kstrtoint(str + 1, 10, &level)) @@ -662,685 +692,750 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) /* f2fs does not support negative compress level now */ if (level < 0) { - f2fs_info(sbi, "do not support negative compress level: %d", level); + f2fs_info(NULL, "do not support negative compress level: %d", level); return -ERANGE; } if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { - f2fs_info(sbi, "invalid zstd compress level: %d", level); + f2fs_info(NULL, "invalid zstd compress level: %d", level); return -EINVAL; } - F2FS_OPTION(sbi).compress_level = level; + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; return 0; } #endif #endif -static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) +static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; + struct f2fs_fs_context *ctx = fc->fs_private; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; unsigned char (*noext)[F2FS_EXTENSION_LEN]; int ext_cnt, noext_cnt; + char *name; #endif - char *p, *name; - int arg = 0; - kuid_t uid; - kgid_t gid; - int ret; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - int token; + substring_t args[MAX_OPT_ARGS]; + struct fs_parse_result result; + int token, ret, arg; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); + token = fs_parse(fc, f2fs_param_specs, param, &result); + if (token < 0) + return token; - switch (token) { - case Opt_gc_background: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "on")) { - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; - } else if (!strcmp(name, "off")) { - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_warn(sbi, "zoned devices need bggc"); - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; - } else if (!strcmp(name, "sync")) { - F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); - break; - case Opt_norecovery: - /* requires ro mount, checked in f2fs_default_check */ - set_opt(sbi, NORECOVERY); - break; - case Opt_discard: - if (!f2fs_hw_support_discard(sbi)) { - f2fs_warn(sbi, "device does not support discard"); - break; - } - set_opt(sbi, DISCARD); - break; - case Opt_nodiscard: - if (f2fs_hw_should_discard(sbi)) { - f2fs_warn(sbi, "discard is required for zoned block devices"); - return -EINVAL; - } - clear_opt(sbi, DISCARD); - break; - case Opt_noheap: - case Opt_heap: - f2fs_warn(sbi, "heap/no_heap options were deprecated"); - break; + switch (token) { + case Opt_gc_background: + F2FS_CTX_INFO(ctx).bggc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_background_gc; + break; + case Opt_disable_roll_forward: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* requires ro mount, checked in f2fs_validate_options */ + ctx_set_opt(ctx, F2FS_MOUNT_NORECOVERY); + break; + case Opt_discard: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + else + ctx_set_opt(ctx, F2FS_MOUNT_DISCARD); + break; + case Opt_noheap: + case Opt_heap: + f2fs_warn(NULL, "heap/no_heap options were deprecated"); + break; 
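+	/*
+	 * How dispatch works under the new mount API, in minimal sketch
+	 * form (assuming the f2fs_param_specs table above): fs_parse()
+	 * matches the key against the spec table and pre-converts the
+	 * value -- a constant_table lookup for fsparam_enum() entries,
+	 * uid/gid/integer parsing for the typed entries -- so each case
+	 * only records the converted result and marks it as specified:
+	 *
+	 *	case Opt_mode:
+	 *		F2FS_CTX_INFO(ctx).fs_mode = result.uint_32;
+	 *		ctx->spec_mask |= F2FS_SPEC_mode;
+	 *		break;
+	 *
+	 * For fsparam_flag_no() entries, result.negated tells "opt" from
+	 * "noopt", which is why the old paired Opt_x/Opt_nox tokens
+	 * collapse into single cases here. The opt_mask/spec_mask bits
+	 * are what f2fs_apply_options() later uses to copy only the
+	 * values the user actually set into F2FS_OPTION(sbi).
+	 */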
#ifdef CONFIG_F2FS_FS_XATTR - case Opt_user_xattr: - set_opt(sbi, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sbi, XATTR_USER); - break; - case Opt_inline_xattr: - set_opt(sbi, INLINE_XATTR); - break; - case Opt_noinline_xattr: - clear_opt(sbi, INLINE_XATTR); - break; - case Opt_inline_xattr_size: - if (args->from && match_int(args, &arg)) - return -EINVAL; - set_opt(sbi, INLINE_XATTR_SIZE); - F2FS_OPTION(sbi).inline_xattr_size = arg; - break; + case Opt_user_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_XATTR_USER); + else + ctx_set_opt(ctx, F2FS_MOUNT_XATTR_USER); + break; + case Opt_inline_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + break; + case Opt_inline_xattr_size: + if (result.int_32 < MIN_INLINE_XATTR_SIZE || + result.int_32 > MAX_INLINE_XATTR_SIZE) { + f2fs_err(NULL, "inline xattr size is out of range: %u ~ %u", + (u32)MIN_INLINE_XATTR_SIZE, (u32)MAX_INLINE_XATTR_SIZE); + return -EINVAL; + } + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE); + F2FS_CTX_INFO(ctx).inline_xattr_size = result.int_32; + ctx->spec_mask |= F2FS_SPEC_inline_xattr_size; + break; #else - case Opt_user_xattr: - case Opt_nouser_xattr: - case Opt_inline_xattr: - case Opt_noinline_xattr: - case Opt_inline_xattr_size: - f2fs_info(sbi, "xattr options not supported"); - break; + case Opt_user_xattr: + case Opt_inline_xattr: + case Opt_inline_xattr_size: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_acl: - set_opt(sbi, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; + case Opt_acl: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_POSIX_ACL); + else + ctx_set_opt(ctx, F2FS_MOUNT_POSIX_ACL); + break; #else - case Opt_acl: - case Opt_noacl: - f2fs_info(sbi, "acl options not supported"); - break; + case Opt_acl: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && - arg != NR_CURSEG_PERSIST_TYPE) - return -EINVAL; - F2FS_OPTION(sbi).active_logs = arg; - break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); - break; - case Opt_inline_data: - set_opt(sbi, INLINE_DATA); - break; - case Opt_inline_dentry: - set_opt(sbi, INLINE_DENTRY); - break; - case Opt_noinline_dentry: - clear_opt(sbi, INLINE_DENTRY); - break; - case Opt_flush_merge: - set_opt(sbi, FLUSH_MERGE); - break; - case Opt_noflush_merge: - clear_opt(sbi, FLUSH_MERGE); - break; - case Opt_nobarrier: - set_opt(sbi, NOBARRIER); - break; - case Opt_barrier: - clear_opt(sbi, NOBARRIER); - break; - case Opt_fastboot: - set_opt(sbi, FASTBOOT); - break; - case Opt_extent_cache: - set_opt(sbi, READ_EXTENT_CACHE); - break; - case Opt_noextent_cache: - if (f2fs_sb_has_device_alias(sbi)) { - f2fs_err(sbi, "device aliasing requires extent cache"); - return -EINVAL; - } - clear_opt(sbi, READ_EXTENT_CACHE); - break; - case Opt_noinline_data: - clear_opt(sbi, INLINE_DATA); - break; - case Opt_data_flush: - set_opt(sbi, DATA_FLUSH); - break; - case Opt_reserve_root: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (test_opt(sbi, RESERVE_ROOT)) { - f2fs_info(sbi, "Preserve previous reserve_root=%u", - F2FS_OPTION(sbi).root_reserved_blocks); - } else { - F2FS_OPTION(sbi).root_reserved_blocks = arg; - set_opt(sbi, RESERVE_ROOT); - } - break; - case Opt_resuid: - if (args->from && 
match_int(args, &arg)) - return -EINVAL; - uid = make_kuid(current_user_ns(), arg); - if (!uid_valid(uid)) { - f2fs_err(sbi, "Invalid uid value %d", arg); - return -EINVAL; - } - F2FS_OPTION(sbi).s_resuid = uid; - break; - case Opt_resgid: - if (args->from && match_int(args, &arg)) - return -EINVAL; - gid = make_kgid(current_user_ns(), arg); - if (!gid_valid(gid)) { - f2fs_err(sbi, "Invalid gid value %d", arg); - return -EINVAL; - } - F2FS_OPTION(sbi).s_resgid = gid; - break; - case Opt_mode: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "adaptive")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; - } else if (!strcmp(name, "lfs")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - } else if (!strcmp(name, "fragment:segment")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; - } else if (!strcmp(name, "fragment:block")) { - F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; + case Opt_active_logs: + if (result.int_32 != 2 && result.int_32 != 4 && + result.int_32 != NR_CURSEG_PERSIST_TYPE) + return -EINVAL; + ctx->spec_mask |= F2FS_SPEC_active_logs; + F2FS_CTX_INFO(ctx).active_logs = result.int_32; + break; + case Opt_disable_ext_identify: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DATA); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DATA); + break; + case Opt_inline_dentry: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + break; + case Opt_flush_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + break; + case Opt_barrier: + if (result.negated) + ctx_set_opt(ctx, F2FS_MOUNT_NOBARRIER); + else + ctx_clear_opt(ctx, F2FS_MOUNT_NOBARRIER); + break; + case Opt_fastboot: + ctx_set_opt(ctx, F2FS_MOUNT_FASTBOOT); + break; + case Opt_extent_cache: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + else + ctx_set_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + break; + case Opt_data_flush: + ctx_set_opt(ctx, F2FS_MOUNT_DATA_FLUSH); + break; + case Opt_reserve_root: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_root; + break; + case Opt_resuid: + F2FS_CTX_INFO(ctx).s_resuid = result.uid; + ctx->spec_mask |= F2FS_SPEC_resuid; + break; + case Opt_resgid: + F2FS_CTX_INFO(ctx).s_resgid = result.gid; + ctx->spec_mask |= F2FS_SPEC_resgid; + break; + case Opt_mode: + F2FS_CTX_INFO(ctx).fs_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_mode; + break; #ifdef CONFIG_F2FS_FAULT_INJECTION - case Opt_fault_injection: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (f2fs_build_fault_attr(sbi, arg, 0, FAULT_RATE)) - return -EINVAL; - set_opt(sbi, FAULT_INJECTION); - break; + case Opt_fault_injection: + F2FS_CTX_INFO(ctx).fault_info.inject_rate = result.int_32; + ctx->spec_mask |= F2FS_SPEC_fault_injection; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; - case Opt_fault_type: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (f2fs_build_fault_attr(sbi, 0, arg, FAULT_TYPE)) - return -EINVAL; - set_opt(sbi, FAULT_INJECTION); - break; + case Opt_fault_type: + if (result.uint_32 > BIT(FAULT_MAX)) + return -EINVAL; + F2FS_CTX_INFO(ctx).fault_info.inject_type = result.uint_32; + ctx->spec_mask |= 
F2FS_SPEC_fault_type; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; #else - case Opt_fault_injection: - case Opt_fault_type: - f2fs_info(sbi, "fault injection options not supported"); - break; + case Opt_fault_injection: + case Opt_fault_type: + f2fs_info(NULL, "%s options not supported", param->key); + break; #endif - case Opt_lazytime: - set_opt(sbi, LAZYTIME); - break; - case Opt_nolazytime: - clear_opt(sbi, LAZYTIME); - break; + case Opt_lazytime: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_LAZYTIME); + else + ctx_set_opt(ctx, F2FS_MOUNT_LAZYTIME); + break; #ifdef CONFIG_QUOTA - case Opt_quota: - case Opt_usrquota: - set_opt(sbi, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sbi, GRPQUOTA); - break; - case Opt_prjquota: - set_opt(sbi, PRJQUOTA); - break; - case Opt_usrjquota: - ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_grpjquota: - ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_prjjquota: - ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); - if (ret) - return ret; - break; - case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sbi, USRQUOTA); - if (ret) - return ret; - break; - case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sbi, GRPQUOTA); - if (ret) - return ret; - break; - case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sbi, PRJQUOTA); - if (ret) - return ret; - break; - case Opt_jqfmt_vfsold: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; - break; - case Opt_jqfmt_vfsv0: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; - break; - case Opt_jqfmt_vfsv1: - F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; - break; - case Opt_noquota: - clear_opt(sbi, QUOTA); - clear_opt(sbi, USRQUOTA); - clear_opt(sbi, GRPQUOTA); - clear_opt(sbi, PRJQUOTA); - break; + case Opt_quota: + if (result.negated) { + ctx_clear_opt(ctx, F2FS_MOUNT_QUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + } else + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_usrquota: + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_grpquota: + ctx_set_opt(ctx, F2FS_MOUNT_GRPQUOTA); + break; + case Opt_prjquota: + ctx_set_opt(ctx, F2FS_MOUNT_PRJQUOTA); + break; + case Opt_usrjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, USRQUOTA); + else + ret = f2fs_note_qf_name(fc, USRQUOTA, param); + if (ret) + return ret; + break; + case Opt_grpjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, GRPQUOTA); + else + ret = f2fs_note_qf_name(fc, GRPQUOTA, param); + if (ret) + return ret; + break; + case Opt_prjjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, PRJQUOTA); + else + ret = f2fs_note_qf_name(fc, PRJQUOTA, param); + if (ret) + return ret; + break; + case Opt_jqfmt: + F2FS_CTX_INFO(ctx).s_jquota_fmt = result.int_32; + ctx->spec_mask |= F2FS_SPEC_jqfmt; + break; #else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - case Opt_prjquota: - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_prjjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_offprjjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - case Opt_noquota: - f2fs_info(sbi, "quota operations not supported"); - break; + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + f2fs_info(NULL, "quota operations not supported"); + break; #endif - case Opt_alloc: - name 
= match_strdup(&args[0]); - if (!name) - return -ENOMEM; - - if (!strcmp(name, "default")) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; - } else if (!strcmp(name, "reuse")) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_fsync: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "posix")) { - F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - } else if (!strcmp(name, "strict")) { - F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; - } else if (!strcmp(name, "nobarrier")) { - F2FS_OPTION(sbi).fsync_mode = - FSYNC_MODE_NOBARRIER; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], - is_remount); - if (ret) - return ret; - break; - case Opt_inlinecrypt: + case Opt_alloc: + F2FS_CTX_INFO(ctx).alloc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_alloc_mode; + break; + case Opt_fsync: + F2FS_CTX_INFO(ctx).fsync_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fsync_mode; + break; + case Opt_test_dummy_encryption: + ret = f2fs_parse_test_dummy_encryption(param, ctx); + if (ret) + return ret; + break; + case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - set_opt(sbi, INLINECRYPT); + ctx_set_opt(ctx, F2FS_MOUNT_INLINECRYPT); #else - f2fs_info(sbi, "inline encryption not supported"); + f2fs_info(NULL, "inline encryption not supported"); #endif - break; + break; + case Opt_checkpoint: + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].from = args[0].to = NULL; + arg = 0; + + /* revert to match_table for checkpoint= options */ + token = match_token(param->string, f2fs_checkpoint_tokens, args); + switch (token) { case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) return -EINVAL; if (arg < 0 || arg > 100) return -EINVAL; - F2FS_OPTION(sbi).unusable_cap_perc = arg; - set_opt(sbi, DISABLE_CHECKPOINT); + F2FS_CTX_INFO(ctx).unusable_cap_perc = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable_cap: if (args->from && match_int(args, &arg)) return -EINVAL; - F2FS_OPTION(sbi).unusable_cap = arg; - set_opt(sbi, DISABLE_CHECKPOINT); + F2FS_CTX_INFO(ctx).unusable_cap = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable: - set_opt(sbi, DISABLE_CHECKPOINT); + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: - clear_opt(sbi, DISABLE_CHECKPOINT); - break; - case Opt_checkpoint_merge: - set_opt(sbi, MERGE_CHECKPOINT); - break; - case Opt_nocheckpoint_merge: - clear_opt(sbi, MERGE_CHECKPOINT); + ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; + default: + return -EINVAL; + } + break; + case Opt_checkpoint_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + else + ctx_set_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION - case Opt_compress_algorithm: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "lzo")) { + case Opt_compress_algorithm: + name = param->string; + if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO - 
F2FS_OPTION(sbi).compress_level = 0; - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZO; + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZO; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lzo compression"); + f2fs_info(NULL, "kernel doesn't support lzo compression"); #endif - } else if (!strncmp(name, "lz4", 3)) { + } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 - ret = f2fs_set_lz4hc_level(sbi, name); - if (ret) { - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZ4; + ret = f2fs_set_lz4hc_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZ4; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lz4 compression"); + f2fs_info(NULL, "kernel doesn't support lz4 compression"); #endif - } else if (!strncmp(name, "zstd", 4)) { + } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD - ret = f2fs_set_zstd_level(sbi, name); - if (ret) { - kfree(name); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_ZSTD; + ret = f2fs_set_zstd_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_ZSTD; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support zstd compression"); + f2fs_info(NULL, "kernel doesn't support zstd compression"); #endif - } else if (!strcmp(name, "lzo-rle")) { + } else if (!strcmp(name, "lzo-rle")) { #ifdef CONFIG_F2FS_FS_LZORLE - F2FS_OPTION(sbi).compress_level = 0; - F2FS_OPTION(sbi).compress_algorithm = - COMPRESS_LZORLE; + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZORLE; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - f2fs_info(sbi, "kernel doesn't support lzorle compression"); + f2fs_info(NULL, "kernel doesn't support lzorle compression"); #endif - } else { - kfree(name); - return -EINVAL; - } - kfree(name); + } else + return -EINVAL; + break; + case Opt_compress_log_size: + if (result.uint_32 < MIN_COMPRESS_LOG_SIZE || + result.uint_32 > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(NULL, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_CTX_INFO(ctx).compress_log_size = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_log_size; + break; + case Opt_compress_extension: + name = param->string; + ext = F2FS_CTX_INFO(ctx).extensions; + ext_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, true)) break; - case Opt_compress_log_size: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg < MIN_COMPRESS_LOG_SIZE || - arg > MAX_COMPRESS_LOG_SIZE) { - f2fs_err(sbi, - "Compress cluster log size is out of range"); - return -EINVAL; - } - F2FS_OPTION(sbi).compress_log_size = arg; + + ret = strscpy(ext[ext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).compress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_compress_extension; + break; + case Opt_nocompress_extension: + name = param->string; + noext = 
F2FS_CTX_INFO(ctx).noextensions; + noext_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, false)) break; - case Opt_compress_extension: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - ext = F2FS_OPTION(sbi).extensions; - ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + ret = strscpy(noext[noext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).nocompress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_nocompress_extension; + break; + case Opt_compress_chksum: + F2FS_CTX_INFO(ctx).compress_chksum = true; + ctx->spec_mask |= F2FS_SPEC_compress_chksum; + break; + case Opt_compress_mode: + F2FS_CTX_INFO(ctx).compress_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_mode; + break; + case Opt_compress_cache: + ctx_set_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE); + break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + case Opt_nocompress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: + case Opt_compress_cache: + f2fs_info(NULL, "compression options not supported"); + break; +#endif + case Opt_atgc: + ctx_set_opt(ctx, F2FS_MOUNT_ATGC); + break; + case Opt_gc_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_GC_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_GC_MERGE); + break; + case Opt_discard_unit: + F2FS_CTX_INFO(ctx).discard_unit = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_discard_unit; + break; + case Opt_memory_mode: + F2FS_CTX_INFO(ctx).memory_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_memory_mode; + break; + case Opt_age_extent_cache: + ctx_set_opt(ctx, F2FS_MOUNT_AGE_EXTENT_CACHE); + break; + case Opt_errors: + F2FS_CTX_INFO(ctx).errors = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_errors; + break; + case Opt_nat_bits: + ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); + break; + } + return 0; +} - if (strlen(name) >= F2FS_EXTENSION_LEN || - ext_cnt >= COMPRESS_EXT_NUM) { - f2fs_err(sbi, - "invalid extension length/number"); - kfree(name); - return -EINVAL; - } +/* + * Check quota settings consistency. + */ +static int f2fs_check_quota_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + #ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + bool quota_turnon = sb_any_quota_loaded(sb); + char *old_qname, *new_qname; + bool usr_qf_name, grp_qf_name, prj_qf_name, usrquota, grpquota, prjquota; + int i; - if (is_compress_extension_exist(sbi, name, true)) { - kfree(name); - break; - } + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA) && + !f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Project quota feature not enabled. 
Cannot enable project quota enforcement."); + return -EINVAL; + } - ret = strscpy(ext[ext_cnt], name); - if (ret < 0) { - kfree(name); - return ret; - } - F2FS_OPTION(sbi).compress_ext_cnt++; - kfree(name); - break; - case Opt_nocompress_extension: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; + if (ctx->qname_mask) { + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; - noext = F2FS_OPTION(sbi).noextensions; - noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + old_qname = F2FS_OPTION(sbi).s_qf_names[i]; + new_qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (quota_turnon && + !!old_qname != !!new_qname) + goto err_jquota_change; - if (strlen(name) >= F2FS_EXTENSION_LEN || - noext_cnt >= COMPRESS_EXT_NUM) { - f2fs_err(sbi, - "invalid extension length/number"); - kfree(name); - return -EINVAL; + if (old_qname) { + if (strcmp(old_qname, new_qname) == 0) { + ctx->qname_mask &= ~(1 << i); + continue; + } + goto err_jquota_specified; } - if (is_compress_extension_exist(sbi, name, false)) { - kfree(name); - break; + if (quota_feature) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + ctx->qname_mask &= ~(1 << i); + kfree(F2FS_CTX_INFO(ctx).s_qf_names[i]); + F2FS_CTX_INFO(ctx).s_qf_names[i] = NULL; } + } + } + + /* Make sure we don't mix old and new quota format */ + usr_qf_name = F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[USRQUOTA]; + grp_qf_name = F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[GRPQUOTA]; + prj_qf_name = F2FS_OPTION(sbi).s_qf_names[PRJQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[PRJQUOTA]; + usrquota = test_opt(sbi, USRQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_USRQUOTA); + grpquota = test_opt(sbi, GRPQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_GRPQUOTA); + prjquota = test_opt(sbi, PRJQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA); + + if (usr_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + usrquota = false; + } + if (grp_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + grpquota = false; + } + if (prj_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + prjquota = false; + } + if (usr_qf_name || grp_qf_name || prj_qf_name) { + if (grpquota || usrquota || prjquota) { + f2fs_err(sbi, "old and new quota format mixing"); + return -EINVAL; + } + if (!(ctx->spec_mask & F2FS_SPEC_jqfmt || + F2FS_OPTION(sbi).s_jquota_fmt)) { + f2fs_err(sbi, "journaled quota format not specified"); + return -EINVAL; + } + } + return 0; + +err_jquota_change: + f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + return -EINVAL; +err_jquota_specified: + f2fs_err(sbi, "%s quota file already specified", + QTYPE2NAME(i)); + return -EINVAL; - ret = strscpy(noext[noext_cnt], name); - if (ret < 0) { - kfree(name); - return ret; - } - F2FS_OPTION(sbi).nocompress_ext_cnt++; - kfree(name); - break; - case Opt_compress_chksum: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - F2FS_OPTION(sbi).compress_chksum = true; - break; - case Opt_compress_mode: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "fs")) { - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; - } else if (!strcmp(name, "user")) { - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; - } else 
{ - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_compress_cache: - if (!f2fs_sb_has_compression(sbi)) { - f2fs_info(sbi, "Image doesn't support compression"); - break; - } - set_opt(sbi, COMPRESS_CACHE); - break; #else - case Opt_compress_algorithm: - case Opt_compress_log_size: - case Opt_compress_extension: - case Opt_nocompress_extension: - case Opt_compress_chksum: - case Opt_compress_mode: - case Opt_compress_cache: - f2fs_info(sbi, "compression options not supported"); - break; + if (f2fs_readonly(sbi->sb)) + return 0; + if (f2fs_sb_has_quota_ino(sbi)) { + f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + + return 0; #endif - case Opt_atgc: - set_opt(sbi, ATGC); - break; - case Opt_gc_merge: - set_opt(sbi, GC_MERGE); - break; - case Opt_nogc_merge: - clear_opt(sbi, GC_MERGE); - break; - case Opt_discard_unit: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "block")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_BLOCK; - } else if (!strcmp(name, "segment")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SEGMENT; - } else if (!strcmp(name, "section")) { - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SECTION; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; - case Opt_memory_mode: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "normal")) { - F2FS_OPTION(sbi).memory_mode = - MEMORY_MODE_NORMAL; - } else if (!strcmp(name, "low")) { - F2FS_OPTION(sbi).memory_mode = - MEMORY_MODE_LOW; - } else { - kfree(name); - return -EINVAL; +} + +static int f2fs_check_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. 
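+ * ("No change" is decided by the fscrypt_dummy_policies_equal() test
+ * below.)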
+ */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&F2FS_OPTION(sbi).dummy_enc_policy, + &F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + f2fs_warn(sbi, "Can't set or change test_dummy_encryption on remount"); + return -EINVAL; + } + return 0; +} + +static inline bool test_compression_spec(unsigned int mask) +{ + return mask & (F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static inline void clear_compression_spec(struct f2fs_fs_context *ctx) +{ + ctx->spec_mask &= ~(F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static int f2fs_check_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i, cnt; + + if (!f2fs_sb_has_compression(sbi)) { + if (test_compression_spec(ctx->spec_mask) || + ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE)) + f2fs_info(sbi, "Image doesn't support compression"); + clear_compression_spec(ctx); + ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE; + return 0; + } + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).compress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).extensions[i], true)) { + F2FS_CTX_INFO(ctx).extensions[i][0] = '\0'; + cnt--; } - kfree(name); - break; - case Opt_age_extent_cache: - set_opt(sbi, AGE_EXTENT_CACHE); - break; - case Opt_errors: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "remount-ro")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_READONLY; - } else if (!strcmp(name, "continue")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_CONTINUE; - } else if (!strcmp(name, "panic")) { - F2FS_OPTION(sbi).errors = - MOUNT_ERRORS_PANIC; - } else { - kfree(name); - return -EINVAL; + } + if (F2FS_OPTION(sbi).compress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid extension length/number"); + return -EINVAL; + } + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).nocompress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).noextensions[i], false)) { + F2FS_CTX_INFO(ctx).noextensions[i][0] = '\0'; + cnt--; } - kfree(name); - break; - case Opt_nat_bits: - set_opt(sbi, NAT_BITS); - break; - default: - f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", - p); + } + if (F2FS_OPTION(sbi).nocompress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid noextension length/number"); return -EINVAL; } } + + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with new extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_OPTION(sbi).extensions, + F2FS_OPTION(sbi).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with old extensions"); + return -EINVAL; + } + if 
(f2fs_test_compress_extension(F2FS_OPTION(sbi).noextensions, + F2FS_OPTION(sbi).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new extensions conflicts with old noextensions"); + return -EINVAL; + } +#endif return 0; } -static int f2fs_default_check(struct f2fs_sb_info *sbi) +static int f2fs_check_opt_consistency(struct fs_context *fc, + struct super_block *sb) { -#ifdef CONFIG_QUOTA - if (f2fs_check_quota_options(sbi)) + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + if (ctx_test_opt(ctx, F2FS_MOUNT_NORECOVERY) && !f2fs_readonly(sb)) return -EINVAL; -#else - if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { - f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + + if (f2fs_hw_should_discard(sbi) && + (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } - if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) { - f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + + if (!f2fs_hw_support_discard(sbi) && + (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "device does not support discard"); + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + ctx->opt_mask &= ~F2FS_MOUNT_DISCARD; + } + + if (f2fs_sb_has_device_alias(sbi) && + (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) && + !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } -#endif + + if (test_opt(sbi, RESERVE_ROOT) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) { + f2fs_info(sbi, "Preserve previous reserve_root=%u", + F2FS_OPTION(sbi).root_reserved_blocks); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; + } + + err = f2fs_check_test_dummy_encryption(fc, sb); + if (err) + return err; + + err = f2fs_check_compression(fc, sb); + if (err) + return err; + + err = f2fs_check_quota_consistency(fc, sb); + if (err) + return err; if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, @@ -1354,15 +1449,19 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) * devices, but mandatory for host-managed zoned block devices. 
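* (Hence the checks that follow: background_gc may not be "off" and only
* lfs mode is accepted once the blkzoned feature is set.)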
*/ if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_CTX_INFO(ctx).bggc_mode == BGGC_MODE_OFF) { + f2fs_warn(sbi, "zoned devices need bggc"); + return -EINVAL; + } #ifdef CONFIG_BLK_DEV_ZONED - if (F2FS_OPTION(sbi).discard_unit != - DISCARD_UNIT_SECTION) { + if ((ctx->spec_mask & F2FS_SPEC_discard_unit) && + F2FS_CTX_INFO(ctx).discard_unit != DISCARD_UNIT_SECTION) { f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); - F2FS_OPTION(sbi).discard_unit = - DISCARD_UNIT_SECTION; + F2FS_CTX_INFO(ctx).discard_unit = DISCARD_UNIT_SECTION; } - if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { + if ((ctx->spec_mask & F2FS_SPEC_mode) && + F2FS_CTX_INFO(ctx).fs_mode != FS_MODE_LFS) { f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); return -EINVAL; } @@ -1372,43 +1471,25 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) #endif } -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_test_compress_extension(sbi)) { - f2fs_err(sbi, "invalid compress or nocompress extension"); - return -EINVAL; - } -#endif - - if (test_opt(sbi, INLINE_XATTR_SIZE)) { - int min_size, max_size; - + if (ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE)) { if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off"); return -EINVAL; } - if (!test_opt(sbi, INLINE_XATTR)) { + if (!ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR) && !test_opt(sbi, INLINE_XATTR)) { f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option"); return -EINVAL; } - - min_size = MIN_INLINE_XATTR_SIZE; - max_size = MAX_INLINE_XATTR_SIZE; - - if (F2FS_OPTION(sbi).inline_xattr_size < min_size || - F2FS_OPTION(sbi).inline_xattr_size > max_size) { - f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d", - min_size, max_size); - return -EINVAL; - } } - if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + if (ctx_test_opt(ctx, F2FS_MOUNT_ATGC) && + F2FS_CTX_INFO(ctx).fs_mode == FS_MODE_LFS) { f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; } - if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && ctx_test_opt(ctx, F2FS_MOUNT_FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -1417,12 +1498,190 @@ static int f2fs_default_check(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + return 0; +} + +static void f2fs_apply_quota_options(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + char *qname; + int i; + + if (quota_feature) + return; + + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; + + qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (qname) { + qname = kstrdup(F2FS_CTX_INFO(ctx).s_qf_names[i], + GFP_KERNEL | __GFP_NOFAIL); + set_opt(sbi, QUOTA); + } + F2FS_OPTION(sbi).s_qf_names[i] = qname; + } + + if (ctx->spec_mask & F2FS_SPEC_jqfmt) + F2FS_OPTION(sbi).s_jquota_fmt = F2FS_CTX_INFO(ctx).s_jquota_fmt; + + if (quota_feature && F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); + F2FS_OPTION(sbi).s_jquota_fmt = 0; + } +#endif +} + +static void f2fs_apply_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = 
F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&F2FS_OPTION(sbi).dummy_enc_policy)) + return; + swap(F2FS_OPTION(sbi).dummy_enc_policy, F2FS_CTX_INFO(ctx).dummy_enc_policy); + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +} - if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { - f2fs_err(sbi, "norecovery requires readonly mount"); +static void f2fs_apply_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned char (*ctx_ext)[F2FS_EXTENSION_LEN]; + unsigned char (*sbi_ext)[F2FS_EXTENSION_LEN]; + int ctx_cnt, sbi_cnt, i; + + if (ctx->spec_mask & F2FS_SPEC_compress_level) + F2FS_OPTION(sbi).compress_level = + F2FS_CTX_INFO(ctx).compress_level; + if (ctx->spec_mask & F2FS_SPEC_compress_algorithm) + F2FS_OPTION(sbi).compress_algorithm = + F2FS_CTX_INFO(ctx).compress_algorithm; + if (ctx->spec_mask & F2FS_SPEC_compress_log_size) + F2FS_OPTION(sbi).compress_log_size = + F2FS_CTX_INFO(ctx).compress_log_size; + if (ctx->spec_mask & F2FS_SPEC_compress_chksum) + F2FS_OPTION(sbi).compress_chksum = + F2FS_CTX_INFO(ctx).compress_chksum; + if (ctx->spec_mask & F2FS_SPEC_compress_mode) + F2FS_OPTION(sbi).compress_mode = + F2FS_CTX_INFO(ctx).compress_mode; + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).extensions; + ctx_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).extensions; + sbi_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).compress_ext_cnt = sbi_cnt; + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).noextensions; + ctx_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).noextensions; + sbi_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).nocompress_ext_cnt = sbi_cnt; + } +#endif +} + +static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + F2FS_OPTION(sbi).opt &= ~ctx->opt_mask; + F2FS_OPTION(sbi).opt |= F2FS_CTX_INFO(ctx).opt; + + if (ctx->spec_mask & F2FS_SPEC_background_gc) + F2FS_OPTION(sbi).bggc_mode = F2FS_CTX_INFO(ctx).bggc_mode; + if (ctx->spec_mask & F2FS_SPEC_inline_xattr_size) + F2FS_OPTION(sbi).inline_xattr_size = + F2FS_CTX_INFO(ctx).inline_xattr_size; + if (ctx->spec_mask & F2FS_SPEC_active_logs) + F2FS_OPTION(sbi).active_logs = F2FS_CTX_INFO(ctx).active_logs; + if (ctx->spec_mask & F2FS_SPEC_reserve_root) + F2FS_OPTION(sbi).root_reserved_blocks = + F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_resgid) + F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; + if (ctx->spec_mask & F2FS_SPEC_resuid) + F2FS_OPTION(sbi).s_resuid = F2FS_CTX_INFO(ctx).s_resuid; + if (ctx->spec_mask & F2FS_SPEC_mode) + F2FS_OPTION(sbi).fs_mode = F2FS_CTX_INFO(ctx).fs_mode; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (ctx->spec_mask & F2FS_SPEC_fault_injection) + (void)f2fs_build_fault_attr(sbi, + F2FS_CTX_INFO(ctx).fault_info.inject_rate, 0, FAULT_RATE); + 
if (ctx->spec_mask & F2FS_SPEC_fault_type) + (void)f2fs_build_fault_attr(sbi, 0, + F2FS_CTX_INFO(ctx).fault_info.inject_type, FAULT_TYPE); +#endif + if (ctx->spec_mask & F2FS_SPEC_alloc_mode) + F2FS_OPTION(sbi).alloc_mode = F2FS_CTX_INFO(ctx).alloc_mode; + if (ctx->spec_mask & F2FS_SPEC_fsync_mode) + F2FS_OPTION(sbi).fsync_mode = F2FS_CTX_INFO(ctx).fsync_mode; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap) + F2FS_OPTION(sbi).unusable_cap = F2FS_CTX_INFO(ctx).unusable_cap; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap_perc) + F2FS_OPTION(sbi).unusable_cap_perc = + F2FS_CTX_INFO(ctx).unusable_cap_perc; + if (ctx->spec_mask & F2FS_SPEC_discard_unit) + F2FS_OPTION(sbi).discard_unit = F2FS_CTX_INFO(ctx).discard_unit; + if (ctx->spec_mask & F2FS_SPEC_memory_mode) + F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; + if (ctx->spec_mask & F2FS_SPEC_errors) + F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + + f2fs_apply_compression(fc, sb); + f2fs_apply_test_dummy_encryption(fc, sb); + f2fs_apply_quota_options(fc, sb); +} + +static int f2fs_sanity_check_options(struct f2fs_sb_info *sbi, bool remount) +{ + if (f2fs_sb_has_device_alias(sbi) && + !test_opt(sbi, READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } + if (!remount) + return 0; + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && + sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + return -EINVAL; + } +#endif + if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { + f2fs_warn(sbi, "LFS is not compatible with IPU"); + return -EINVAL; + } return 0; } @@ -1442,6 +1701,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); + atomic_set(&fi->open_count, 0); init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); @@ -1718,7 +1978,7 @@ static void f2fs_put_super(struct super_block *sb) destroy_percpu_info(sbi); f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) - kvfree(sbi->write_io[i]); + kfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif @@ -2329,11 +2589,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_flush_ckpt_thread(sbi); } -static int f2fs_remount(struct super_block *sb, int *flags, char *data) +static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; + unsigned int flags = fc->sb_flags; int err; bool need_restart_gc = false, need_stop_gc = false; bool need_restart_flush = false, need_stop_flush = false; @@ -2379,7 +2640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_info(sbi, "Try to recover all the superblocks, ret: %d", err); @@ -2389,23 +2650,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi, true); - /* parse mount options */ - err = parse_options(sbi, data, true); + err = 
f2fs_check_opt_consistency(fc, sb); if (err) goto restore_opts; -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi) && - sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { - f2fs_err(sbi, - "zoned: max open zones %u is too small, need at least %u open zones", - sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); - err = -EINVAL; - goto restore_opts; - } -#endif + f2fs_apply_options(fc, sb); - err = f2fs_default_check(sbi); + err = f2fs_sanity_check_options(sbi, true); if (err) goto restore_opts; @@ -2416,20 +2667,20 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) + if (f2fs_readonly(sb) && (flags & SB_RDONLY)) goto skip; - if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { + if (f2fs_dev_is_readonly(sbi) && !(flags & SB_RDONLY)) { err = -EROFS; goto restore_opts; } #ifdef CONFIG_QUOTA - if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { + if (!f2fs_readonly(sb) && (flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { + } else if (f2fs_readonly(sb) && !(flags & SB_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { @@ -2441,12 +2692,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif - if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { - err = -EINVAL; - f2fs_warn(sbi, "LFS is not compatible with IPU"); - goto restore_opts; - } - /* disallow enable atgc dynamically */ if (no_atgc == !!test_opt(sbi, ATGC)) { err = -EINVAL; @@ -2485,7 +2730,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + if ((flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); goto restore_opts; @@ -2496,7 +2741,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & SB_RDONLY) || + if ((flags & SB_RDONLY) || (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { @@ -2510,7 +2755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY) { + if (flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2523,7 +2768,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); need_restart_flush = true; @@ -2565,11 +2810,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * triggered while remount and we need to take care of it before * returning from remount. */ - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + if ((flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); } else { - /* Flush if the prevous checkpoint, if exists. */ + /* Flush if the previous checkpoint, if exists. 
*/ f2fs_flush_ckpt_thread(sbi); err = f2fs_start_ckpt_thread(sbi); @@ -2592,7 +2837,7 @@ skip: (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + fc->sb_flags = (flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); sbi->umount_lock_holder = NULL; return 0; @@ -3263,7 +3508,6 @@ static const struct super_operations f2fs_sops = { .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, - .remount_fs = f2fs_remount, .shutdown = f2fs_shutdown, }; @@ -3451,6 +3695,7 @@ static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio, f2fs_bug_on(sbi, 1); ret = submit_bio_wait(bio); + bio_put(bio); folio_end_writeback(folio); return ret; @@ -4522,14 +4767,14 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sbi->readdir_ra = true; } -static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +static int f2fs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct f2fs_fs_context *ctx = fc->fs_private; struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; int err; bool skip_recovery = false, need_fsck = false; - char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; int retry_cnt = 1; @@ -4592,18 +4837,14 @@ try_onemore: sizeof(raw_super->uuid)); default_options(sbi, false); - /* parse mount options */ - options = kstrdup((const char *)data, GFP_KERNEL); - if (data && !options) { - err = -ENOMEM; - goto free_sb_buf; - } - err = parse_options(sbi, options, false); + err = f2fs_check_opt_consistency(fc, sb); if (err) - goto free_options; + goto free_sb_buf; + + f2fs_apply_options(fc, sb); - err = f2fs_default_check(sbi); + err = f2fs_sanity_check_options(sbi, false); if (err) goto free_options; @@ -4770,6 +5011,10 @@ try_onemore: /* get segno of first zoned block device */ sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi); + sbi->reserved_pin_section = f2fs_sb_has_blkzoned(sbi) ? 
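/*
 * Review note: the default for the new reserved_pin_section knob is chosen
 * here at mount time -- a fixed ZONED_PIN_SEC_REQUIRED_COUNT on zoned block
 * devices, otherwise the overprovision area converted from segments to
 * sections with GET_SEC_FROM_SEG(). The matching sysfs store in
 * fs/f2fs/sysfs.c further down rejects values above that
 * overprovision-derived ceiling.
 */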
+ ZONED_PIN_SEC_REQUIRED_COUNT : + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)); + /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) @@ -4930,7 +5175,6 @@ reset_checkpoint: if (err) goto sync_free_meta; } - kvfree(options); /* recover broken superblock */ if (recovery) { @@ -5013,7 +5257,7 @@ free_iostat: f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) - kvfree(sbi->write_io[i]); + kfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -5024,8 +5268,8 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); - kvfree(options); + /* no need to free dummy_enc_policy, we just keep it in ctx when failed */ + swap(F2FS_CTX_INFO(ctx).dummy_enc_policy, F2FS_OPTION(sbi).dummy_enc_policy); free_sb_buf: kfree(raw_super); free_sbi: @@ -5041,12 +5285,39 @@ free_sbi: return err; } -static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int f2fs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); + return get_tree_bdev(fc, f2fs_fill_super); } +static int f2fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + + return __f2fs_remount(fc, sb); +} + +static void f2fs_fc_free(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + + if (!ctx) + return; + +#ifdef CONFIG_QUOTA + f2fs_unnote_qf_name_all(fc); +#endif + fscrypt_free_dummy_policy(&F2FS_CTX_INFO(ctx).dummy_enc_policy); + kfree(ctx); +} + +static const struct fs_context_operations f2fs_context_ops = { + .parse_param = f2fs_parse_param, + .get_tree = f2fs_get_tree, + .reconfigure = f2fs_reconfigure, + .free = f2fs_fc_free, +}; + static void kill_f2fs_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -5088,10 +5359,24 @@ static void kill_f2fs_super(struct super_block *sb) } } +static int f2fs_init_fs_context(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct f2fs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &f2fs_context_ops; + + return 0; +} + static struct file_system_type f2fs_fs_type = { .owner = THIS_MODULE, .name = "f2fs", - .mount = f2fs_mount, + .init_fs_context = f2fs_init_fs_context, .kill_sb = kill_f2fs_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 75134d69a0bd..f736052dea50 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -628,6 +628,27 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_no_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_valid_thresh_ratio")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -824,6 +845,27 @@ out: return count; } + if (!strcmp(a->attr.name, "reserved_pin_section")) { + if (t > GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi))) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_multiple")) { + if (t < 1 || t > 
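/*
 * Review note: the stores added in this sysfs hunk follow the usual f2fs
 * idiom -- parse the value, compare it against a fixed or device-derived
 * limit, and either return -EINVAL or commit it and return count. A hedged
 * generic sketch (the attribute name "some_percent_knob" is illustrative):
 *
 *	if (!strcmp(a->attr.name, "some_percent_knob")) {
 *		if (t > 100)
 *			return -EINVAL;
 *		*ui = (unsigned int)t;
 *		return count;
 *	}
 *
 * gc_boost_gc_multiple is additionally clamped to [1, SEGS_PER_SEC(sbi)] so
 * the boosted migration window never exceeds one section, while
 * gc_boost_gc_greedy rejects anything above GC_GREEDY.
 */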
SEGS_PER_SEC(sbi)) + return -EINVAL; + sbi->gc_thread->boost_gc_multiple = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_greedy")) { + if (t > GC_GREEDY) + return -EINVAL; + sbi->gc_thread->boost_gc_greedy = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1050,6 +1092,8 @@ GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio); +GC_THREAD_RW_ATTR(gc_boost_gc_multiple, boost_gc_multiple); +GC_THREAD_RW_ATTR(gc_boost_gc_greedy, boost_gc_greedy); /* SM_INFO ATTR */ SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); @@ -1130,6 +1174,7 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); +F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1220,6 +1265,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_no_zoned_gc_percent), ATTR_LIST(gc_boost_zoned_gc_percent), ATTR_LIST(gc_valid_thresh_ratio), + ATTR_LIST(gc_boost_gc_multiple), + ATTR_LIST(gc_boost_gc_greedy), ATTR_LIST(gc_idle), ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), @@ -1323,6 +1370,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), + ATTR_LIST(reserved_pin_section), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 1db348f8f887..a7061c2ad8e4 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -356,7 +356,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); - fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); + fat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index c7a2d27120ba..950da09f0961 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -158,9 +158,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) mark_inode_dirty(inode); } if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { - fat_fs_error(sb, "clusters badly computed (%d != %llu)", - new_fclus, - (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); + fat_fs_error_ratelimit( + sb, "clusters badly computed (%d != %llu)", new_fclus, + (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); fat_cache_inval_inode(inode); } inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2203438738f6..76c86f1c2b1c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1071,6 +1071,7 @@ static int ocfs2_grab_folios_for_write(struct address_space *mapping, if (IS_ERR(wc->w_folios[i])) { ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); + wc->w_folios[i] = NULL; goto out; } } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 7799f4d16ce9..8c9c4825f984 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -798,6 +798,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } } + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has empty extent list at depth %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth)); + goto out; + } + found = 0; for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { rec = &el->l_recs[i]; diff --git 
a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 67fc62a49a76..00f52812dbb0 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2632,7 +2632,7 @@ again: dlm_reco_master_ready(dlm), msecs_to_jiffies(1000)); if (!dlm_reco_master_ready(dlm)) { - mlog(0, "%s: reco master taking awhile\n", + mlog(0, "%s: reco master taking a while\n", dlm->name); goto again; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 12e5d1f73325..14bf440ea4df 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -50,8 +50,6 @@ struct ocfs2_find_inode_args unsigned int fi_sysfile_type; }; -static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; - static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); @@ -250,14 +248,77 @@ bail: static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; +#endif inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; - if (args->fi_sysfile_type != 0) +#ifdef CONFIG_LOCKDEP + switch (args->fi_sysfile_type) { + case BAD_BLOCK_SYSTEM_INODE: + break; + case GLOBAL_INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]); + break; + case SLOT_MAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]); + break; + case HEARTBEAT_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]); + break; + case GLOBAL_BITMAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]); + break; + case USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]); + break; + case GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]); + break; + case ORPHAN_DIR_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]); + break; + case EXTENT_ALLOC_SYSTEM_INODE: lockdep_set_class(&inode->i_rwsem, - &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]); + break; + case INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]); + break; + case JOURNAL_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]); + break; + case LOCAL_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]); + break; + case TRUNCATE_LOG_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]); + break; + case LOCAL_USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]); + break; + case LOCAL_GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]); + break; + default: + WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type); + } if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == 
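/*
 * Review note: the single variable-indexed lockdep_set_class() call is
 * unrolled here into a switch with one case per system-inode type, and the
 * key array moves inside the function under #ifdef CONFIG_LOCKDEP.
 * Presumably the point of the constant subscripts is to let lockdep give
 * every system-inode type its own distinctly named lock class rather than
 * aliasing them all through one variable index; BAD_BLOCK_SYSTEM_INODE
 * intentionally keeps the default class, and unknown types now trip
 * WARN_ONCE().
 */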
LOCAL_USER_QUOTA_SYSTEM_INODE || @@ -267,6 +328,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); +#endif return 0; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 369c7d27befd..cbe2f8ed8897 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -617,6 +617,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, */ credits += OCFS2_INODE_UPDATE_CREDITS + 1; + inode_lock(tl_inode); + /* * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() * logic, while we still need to lock the global_bitmap. @@ -626,7 +628,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); ret = -EIO; - goto out; + goto out_unlock_tl_inode; } inode_lock(gb_inode); @@ -634,16 +636,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { mlog_errno(ret); - goto out_unlock_gb_mutex; + goto out_unlock_gb_inode; } - inode_lock(tl_inode); - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock_tl_inode; + goto out_unlock; } new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); @@ -703,15 +703,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, out_commit: ocfs2_commit_trans(osb, handle); brelse(gd_bh); - -out_unlock_tl_inode: - inode_unlock(tl_inode); - +out_unlock: ocfs2_inode_unlock(gb_inode, 1); -out_unlock_gb_mutex: +out_unlock_gb_inode: inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); +out_unlock_tl_inode: + inode_unlock(tl_inode); out: if (context->meta_ac) { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 99278c8f0e24..c90b254da75e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -142,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, bail_add: ret = d_splice_alias(inode, dentry); + if (IS_ERR(ret)) + goto bail_unlock; if (inode) { /* @@ -154,15 +156,16 @@ bail_add: * NOTE: This dentry already has ->d_op set from * ocfs2_get_parent() and ocfs2_get_dentry() */ - if (!IS_ERR_OR_NULL(ret)) + if (ret) dentry = ret; status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); + if (ret) + dput(ret); ret = ERR_PTR(status); - goto bail_unlock; } } else ocfs2_dentry_attach_gen(dentry); @@ -1452,8 +1455,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; trace_ocfs2_rename_over_existing( - (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 
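/*
 * Review note: newfe_bh->b_data is dereferenced a few lines above this
 * trace call (newfe = (struct ocfs2_dinode *)newfe_bh->b_data), so the
 * "newfe_bh ? ... : 0ULL" guard deleted here was dead code -- newfe_bh
 * cannot be NULL by this point.
 */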
- (unsigned long long)newfe_bh->b_blocknr : 0ULL); + (unsigned long long)newfe_blkno, newfe_bh, + (unsigned long long)newfe_bh->b_blocknr); if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 77edcd70f72c..0f045e45fa0c 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -360,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file, struct ocfs2_control_message_setn *msg) { long nodenum; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; if (ocfs2_control_get_handshake_state(file) != @@ -375,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, return -EINVAL; msg->space = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -391,7 +389,6 @@ static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) { long major, minor; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = &ocfs2_user_plugin.sp_max_proto; @@ -409,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - major = simple_strtol(msg->major, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->major, 16, &major)) return -EINVAL; - minor = simple_strtol(msg->minor, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->minor, 16, &minor)) return -EINVAL; /* @@ -441,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file, struct ocfs2_control_message_down *msg) { long nodenum; - char *p = NULL; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_VALID) @@ -456,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &p, 16); - if (!p || *p) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 10d01eb09c43..f188bd900eb2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1490,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return -EINVAL; dump = vzalloc(sizeof(*dump)); - if (!dump) { - ret = -ENOMEM; - goto out_err; - } + if (!dump) + return -ENOMEM; /* Keep size of the buffer page aligned so that it can be mmaped */ data_size = roundup(sizeof(struct vmcoredd_header) + data->size, @@ -1519,22 +1517,19 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) dump->size = data_size; /* Add the dump to driver sysfs list and update the elfcore hdr */ - mutex_lock(&vmcore_mutex); - if (vmcore_opened) - pr_warn_once("Unexpected adding of device dump\n"); - if (vmcore_open) { - ret = -EBUSY; - goto unlock; - } + scoped_guard(mutex, &vmcore_mutex) { + if (vmcore_opened) + pr_warn_once("Unexpected adding of device dump\n"); + if (vmcore_open) { + ret = -EBUSY; + goto out_err; + } - list_add_tail(&dump->list, &vmcoredd_list); - vmcoredd_update_size(data_size); - mutex_unlock(&vmcore_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + vmcoredd_update_size(data_size); + } return 0; -unlock: - mutex_unlock(&vmcore_mutex); - out_err: vfree(buf); vfree(dump); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 
3061043e915c..b69c294e3ef0 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,23 +80,22 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct address_space *cache_mapping, u64 index, int length, u64 read_start, u64 read_end, int page_count) { - struct page *head_to_cache = NULL, *tail_to_cache = NULL; + struct folio *head_to_cache = NULL, *tail_to_cache = NULL; struct block_device *bdev = fullbio->bi_bdev; int start_idx = 0, end_idx = 0; - struct bvec_iter_all iter_all; + struct folio_iter fi; struct bio *bio = NULL; - struct bio_vec *bv; int idx = 0; int err = 0; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL - struct page **cache_pages = kmalloc_array(page_count, - sizeof(void *), GFP_KERNEL | __GFP_ZERO); + struct folio **cache_folios = kmalloc_array(page_count, + sizeof(*cache_folios), GFP_KERNEL | __GFP_ZERO); #endif - bio_for_each_segment_all(bv, fullbio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, fullbio) { + struct folio *folio = fi.folio; - if (page->mapping == cache_mapping) { + if (folio->mapping == cache_mapping) { idx++; continue; } @@ -111,13 +110,13 @@ static int squashfs_bio_read_cached(struct bio *fullbio, * adjacent blocks. */ if (idx == 0 && index != read_start) - head_to_cache = page; + head_to_cache = folio; else if (idx == page_count - 1 && index + length != read_end) - tail_to_cache = page; + tail_to_cache = folio; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL /* Cache all pages in the BIO for repeated reads */ - else if (cache_pages) - cache_pages[idx] = page; + else if (cache_folios) + cache_folios[idx] = folio; #endif if (!bio || idx != end_idx) { @@ -150,45 +149,45 @@ static int squashfs_bio_read_cached(struct bio *fullbio, return err; if (head_to_cache) { - int ret = add_to_page_cache_lru(head_to_cache, cache_mapping, + int ret = filemap_add_folio(cache_mapping, head_to_cache, read_start >> PAGE_SHIFT, GFP_NOIO); if (!ret) { - SetPageUptodate(head_to_cache); - unlock_page(head_to_cache); + folio_mark_uptodate(head_to_cache); + folio_unlock(head_to_cache); } } if (tail_to_cache) { - int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping, + int ret = filemap_add_folio(cache_mapping, tail_to_cache, (read_end >> PAGE_SHIFT) - 1, GFP_NOIO); if (!ret) { - SetPageUptodate(tail_to_cache); - unlock_page(tail_to_cache); + folio_mark_uptodate(tail_to_cache); + folio_unlock(tail_to_cache); } } #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL - if (!cache_pages) + if (!cache_folios) goto out; for (idx = 0; idx < page_count; idx++) { - if (!cache_pages[idx]) + if (!cache_folios[idx]) continue; - int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + int ret = filemap_add_folio(cache_mapping, cache_folios[idx], (read_start >> PAGE_SHIFT) + idx, GFP_NOIO); if (!ret) { - SetPageUptodate(cache_pages[idx]); - unlock_page(cache_pages[idx]); + folio_mark_uptodate(cache_folios[idx]); + folio_unlock(cache_folios[idx]); } } - kfree(cache_pages); + kfree(cache_folios); out: #endif return 0; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 5ca2baa16dc2..ce7d661d5ad8 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -493,10 +493,9 @@ out: return res; } -static int squashfs_readahead_fragment(struct page **page, +static int squashfs_readahead_fragment(struct inode *inode, struct page **page, unsigned int pages, unsigned int expected, loff_t start) { - struct inode *inode = page[0]->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, 
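/*
 * Review note: squashfs_readahead_fragment() now takes the inode as an
 * explicit parameter instead of reaching through page[0]->mapping->host,
 * so the helper no longer assumes the first page is already attached to
 * the address_space -- in line with the folio conversion applied to
 * fs/squashfs/block.c above.
 */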
squashfs_i(inode)->fragment_size); @@ -605,8 +604,8 @@ static void squashfs_readahead(struct readahead_control *ractl) if (start >> msblk->block_log == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { - res = squashfs_readahead_fragment(pages, nr_pages, - expected, start); + res = squashfs_readahead_fragment(inode, pages, + nr_pages, expected, start); if (res) goto skip_pages; continue; diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 32048052c64a..5d07c469b571 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -127,6 +127,8 @@ #define __diag_GCC_8(s) #endif +#define __diag_GCC_all(s) __diag(s) + #define __diag_ignore_all(option, comment) \ __diag(__diag_GCC_ignore option) diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 1fe7e7d1b214..7b44b41d0a20 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -13,10 +13,23 @@ */ extern struct resource crashk_res; extern struct resource crashk_low_res; +extern struct range crashk_cma_ranges[]; +#if defined(CONFIG_CMA) && defined(CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION) +#define CRASHKERNEL_CMA +#define CRASHKERNEL_CMA_RANGES_MAX 4 +extern int crashk_cma_cnt; +#else +#define crashk_cma_cnt 0 +#define CRASHKERNEL_CMA_RANGES_MAX 0 +#endif + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); + unsigned long long *low_size, unsigned long long *cma_size, + bool *high); + +void __init reserve_crashkernel_cma(unsigned long long cma_size); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5206d63b3386..2f8b8bfc0e73 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -268,7 +268,7 @@ struct node_footer { /* Node IDs in an Indirect Block */ #define NIDS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32)) -#define ADDRS_PER_PAGE(page, inode) (addrs_per_page(inode, IS_INODE(page))) +#define ADDRS_PER_PAGE(folio, inode) (addrs_per_page(inode, IS_INODE(folio))) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 8d0e3ad89b94..10dd161690a2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -332,12 +332,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return (struct page *)page_private(bounce_page); } -static inline bool fscrypt_is_bounce_folio(struct folio *folio) +static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return folio->mapping == NULL; } -static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) +static inline +struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { return bounce_folio->private; } @@ -517,12 +518,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return ERR_PTR(-EINVAL); } -static inline bool fscrypt_is_bounce_folio(struct folio *folio) +static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return false; } -static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) +static inline +struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); diff --git 
a/include/linux/gcd.h b/include/linux/gcd.h index cb572677fd7f..616e81a7f7e3 100644 --- a/include/linux/gcd.h +++ b/include/linux/gcd.h @@ -3,6 +3,9 @@ #define _GCD_H #include <linux/compiler.h> +#include <linux/jump_label.h> + +DECLARE_STATIC_KEY_TRUE(efficient_ffs_key); unsigned long gcd(unsigned long a, unsigned long b) __attribute_const__; diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h index 1bc2b3244613..34e615c76ca5 100644 --- a/include/linux/hung_task.h +++ b/include/linux/hung_task.h @@ -21,17 +21,17 @@ * type. * * Type encoding: - * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) - * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) - * 10 - Blocked on rt-mutex (BLOCKER_TYPE_RTMUTEX) - * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM) + * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) + * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) + * 10 - Blocked on rw-semaphore as READER (BLOCKER_TYPE_RWSEM_READER) + * 11 - Blocked on rw-semaphore as WRITER (BLOCKER_TYPE_RWSEM_WRITER) */ -#define BLOCKER_TYPE_MUTEX 0x00UL -#define BLOCKER_TYPE_SEM 0x01UL -#define BLOCKER_TYPE_RTMUTEX 0x02UL -#define BLOCKER_TYPE_RWSEM 0x03UL +#define BLOCKER_TYPE_MUTEX 0x00UL +#define BLOCKER_TYPE_SEM 0x01UL +#define BLOCKER_TYPE_RWSEM_READER 0x02UL +#define BLOCKER_TYPE_RWSEM_WRITER 0x03UL -#define BLOCKER_TYPE_MASK 0x03UL +#define BLOCKER_TYPE_MASK 0x03UL #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER static inline void hung_task_set_blocker(void *lock, unsigned long type) diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index b674f64d0822..7f136de4b73e 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -245,7 +245,7 @@ void i3c_driver_unregister(struct i3c_driver *drv); * * Return: 0 if both registrations succeeds, a negative error code otherwise. */ -static inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv, +static __always_inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv, struct i2c_driver *i2cdrv) { int ret; @@ -270,7 +270,7 @@ static inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv, * Note that when CONFIG_I3C is not enabled, this function only unregisters the * @i2cdrv. */ -static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, +static __always_inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, struct i2c_driver *i2cdrv) { if (IS_ENABLED(CONFIG_I3C)) diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index c67922ece617..043f5c7ff398 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -249,10 +249,15 @@ struct i3c_device { */ #define I3C_BUS_MAX_DEVS 11 -#define I3C_BUS_MAX_I3C_SCL_RATE 12900000 -#define I3C_BUS_TYP_I3C_SCL_RATE 12500000 -#define I3C_BUS_I2C_FM_PLUS_SCL_RATE 1000000 -#define I3C_BUS_I2C_FM_SCL_RATE 400000 +/* Taken from the I3C Spec V1.1.1, chapter 6.2. 
"Timing specification" */ +#define I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE 1000000 +#define I3C_BUS_I2C_FM_SCL_MAX_RATE 400000 +#define I3C_BUS_I3C_SCL_MAX_RATE 12900000 +#define I3C_BUS_I3C_SCL_TYP_RATE 12500000 +#define I3C_BUS_TAVAL_MIN_NS 1000 +#define I3C_BUS_TBUF_MIXED_FM_MIN_NS 1300 +#define I3C_BUS_THIGH_MIXED_MAX_NS 41 +#define I3C_BUS_TIDLE_MIN_NS 200000 #define I3C_BUS_TLOW_OD_MIN_NS 200 /** diff --git a/include/linux/jhash.h b/include/linux/jhash.h index fa26a2dd3b52..7c1c1821c694 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -24,7 +24,7 @@ * Jozsef */ #include <linux/bitops.h> -#include <linux/unaligned/packed_struct.h> +#include <linux/unaligned.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) @@ -77,9 +77,9 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { - a += __get_unaligned_cpu32(k); - b += __get_unaligned_cpu32(k + 4); - c += __get_unaligned_cpu32(k + 8); + a += get_unaligned((u32 *)k); + b += get_unaligned((u32 *)(k + 4)); + c += get_unaligned((u32 *)(k + 8)); __jhash_mix(a, b, c); length -= 12; k += 12; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 03f85ad03025..1b10a5d84b68 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes; typedef unsigned long kimage_entry_t; +/* + * This is a copy of the UAPI struct kexec_segment and must be identical + * to it because it gets copied straight from user space into kernel + * memory. Do not modify this structure unless you change the way segments + * get ingested from user space. + */ struct kexec_segment { /* * This pointer can point to user memory if kexec_load() system @@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_align: Minimum alignment needed. * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. + * @cma: CMA page if the buffer is backed by CMA. * @top_down: Allocate from top of memory. * @random: Place the buffer at a random position. 
*/ @@ -184,6 +191,7 @@ struct kexec_buf { unsigned long buf_align; unsigned long buf_min; unsigned long buf_max; + struct page *cma; bool top_down; #ifdef CONFIG_CRASH_DUMP bool random; @@ -340,6 +348,7 @@ struct kimage { unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + struct page *segment_cma[KEXEC_SEGMENT_MAX]; struct list_head control_pages; struct list_head dest_pages; @@ -361,6 +370,7 @@ struct kimage { */ unsigned int hotplug_support:1; #endif + unsigned int no_cma:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; diff --git a/include/linux/module.h b/include/linux/module.h index a7cac01d95e7..313ecb8e5181 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -33,7 +33,7 @@ #include <linux/percpu.h> #include <asm/module.h> -#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN +#define MODULE_NAME_LEN __MODULE_NAME_LEN struct modversion_info { unsigned long crc; @@ -303,23 +303,6 @@ static typeof(name) __mod_device_table__##type##__##name \ struct notifier_block; -#ifdef CONFIG_MODULES - -/* Get/put a kernel symbol (calls must be symmetric) */ -void *__symbol_get(const char *symbol); -void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ({ \ - static const char __notrim[] \ - __used __section(".no_trim_symbol") = __stringify(x); \ - (typeof(&x))(__symbol_get(__stringify(x))); }) - -/* modules using other modules: kdb wants to see this. */ -struct module_use { - struct list_head source_list; - struct list_head target_list; - struct module *source, *target; -}; - enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ @@ -604,6 +587,16 @@ struct module { #define MODULE_ARCH_INIT {} #endif +#ifdef CONFIG_MODULES + +/* Get/put a kernel symbol (calls must be symmetric) */ +void *__symbol_get(const char *symbol); +void *__symbol_get_gpl(const char *symbol); +#define symbol_get(x) ({ \ + static const char __notrim[] \ + __used __section(".no_trim_symbol") = __stringify(x); \ + (typeof(&x))(__symbol_get(__stringify(x))); }) + #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index bfb85fd13e1f..a04a2bc4f51e 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -6,6 +6,13 @@ #include <linux/stringify.h> #include <linux/kernel.h> +/* + * The maximum module name length, including the NUL byte. + * Chosen so that structs with an unsigned long line up, specifically + * modversion_info. + */ +#define __MODULE_NAME_LEN (64 - sizeof(unsigned long)) + /* You can override this manually, but generally this should match the module name. */ #ifdef MODULE @@ -17,9 +24,6 @@ #define __MODULE_INFO_PREFIX KBUILD_MODNAME "." #endif -/* Chosen so that structs with an unsigned long line up. */ -#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) - #define __MODULE_INFO(tag, name, info) \ static const char __UNIQUE_ID(name)[] \ __used __section(".modinfo") __aligned(1) \ @@ -282,10 +286,9 @@ struct kparam_array #define __moduleparam_const const #endif -/* This is the fundamental function for registering boot/module - parameters. */ +/* This is the fundamental function for registering boot/module parameters. */ #define __module_param_call(prefix, name, ops, arg, perm, level, flags) \ - /* Default value instead of permissions? 
*/ \ + static_assert(sizeof(""prefix) - 1 <= __MODULE_NAME_LEN); \ static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used __section("__param") \ diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 72ff44cca864..2467b3be15c9 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -11,8 +11,13 @@ #ifdef __KERNEL__ #include <linux/blkdev.h> +#include <linux/mm.h> -extern const char raid6_empty_zero_page[PAGE_SIZE]; +/* This should be const but the raid6 code is too convoluted for that. */ +static inline void *raid6_get_zero_page(void) +{ + return page_address(ZERO_PAGE(0)); +} #else /* ! __KERNEL__ */ /* Used for testing in user space */ @@ -191,6 +196,11 @@ static inline uint32_t raid6_jiffies(void) return tv.tv_sec*1000 + tv.tv_usec/1000; } +static inline void *raid6_get_zero_page(void) +{ + return raid6_empty_zero_page; +} + #endif /* ! __KERNEL__ */ #endif /* LINUX_RAID_RAID6_H */ diff --git a/include/linux/relay.h b/include/linux/relay.h index b3224111d074..6772a7075840 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -29,6 +29,22 @@ #define RELAYFS_CHANNEL_VERSION 7 /* + * Relay buffer statistics + */ +enum { + RELAY_STATS_BUF_FULL = (1 << 0), + RELAY_STATS_WRT_BIG = (1 << 1), + + RELAY_STATS_LAST = RELAY_STATS_WRT_BIG, +}; + +struct rchan_buf_stats +{ + unsigned int full_count; /* counter for buffer full */ + unsigned int big_count; /* counter for too big to write */ +}; + +/* * Per-cpu relay channel buffer */ struct rchan_buf @@ -43,11 +59,11 @@ struct rchan_buf struct irq_work wakeup_work; /* reader wakeup */ struct dentry *dentry; /* channel file dentry */ struct kref kref; /* channel buffer refcount */ + struct rchan_buf_stats stats; /* buffer stats */ struct page **page_array; /* array of current buffer pages */ unsigned int page_count; /* number of current buffer pages */ unsigned int finalized; /* buffer has been finalized */ size_t *padding; /* padding counts per sub-buffer */ - size_t prev_padding; /* temporary variable */ size_t bytes_consumed; /* bytes consumed in cur read subbuf */ size_t early_bytes; /* bytes consumed before VFS inited */ unsigned int cpu; /* this buf's cpu */ @@ -65,7 +81,6 @@ struct rchan const struct rchan_callbacks *cb; /* client callbacks */ struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ - size_t last_toobig; /* tried to log event > subbuf size */ struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */ int is_global; /* One global buffer ? */ struct list_head list; /* for channel list */ @@ -84,7 +99,6 @@ struct rchan_callbacks * @buf: the channel buffer containing the new sub-buffer * @subbuf: the start of the new sub-buffer * @prev_subbuf: the start of the previous sub-buffer - * @prev_padding: unused space at the end of previous sub-buffer * * The client should return 1 to continue logging, 0 to stop * logging. 
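 *
 * Review note: with @prev_padding gone (per-sub-buffer padding counts
 * already live in buf->padding[], and the overflow counters moved into
 * buf->stats), a minimal client callback under the new signature could look
 * like this hedged sketch (my_subbuf_start is a hypothetical name):
 *
 *	static int my_subbuf_start(struct rchan_buf *buf, void *subbuf,
 *				   void *prev_subbuf)
 *	{
 *		return !relay_buf_full(buf);	// log while space remains
 *	}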
@@ -100,8 +114,7 @@ struct rchan_callbacks */ int (*subbuf_start) (struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding); + void *prev_subbuf); /* * create_buf_file - create file to represent a relay channel buffer @@ -161,6 +174,7 @@ struct rchan *relay_open(const char *base_filename, void *private_data); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); +size_t relay_stats(struct rchan *chan, int flags); extern void relay_subbufs_consumed(struct rchan *chan, unsigned int cpu, size_t consumed); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index bc90c3c7b5fd..876358cfe1b1 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -144,6 +144,9 @@ int ring_buffer_write(struct trace_buffer *buffer, void ring_buffer_nest_start(struct trace_buffer *buffer); void ring_buffer_nest_end(struct trace_buffer *buffer); +DEFINE_GUARD(ring_buffer_nest, struct trace_buffer *, + ring_buffer_nest_start(_T), ring_buffer_nest_end(_T)) + struct ring_buffer_event * ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h index 5a41c3bbcbe3..01da4582db6d 100644 --- a/include/linux/rtc/ds1685.h +++ b/include/linux/rtc/ds1685.h @@ -8,7 +8,7 @@ * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC * write counter. * - * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>. + * Copyright (C) 2011-2014 Joshua Kinard <linux@kumba.dev>. * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>. * * References: diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index cbafdc12e743..f1aaf676a874 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -132,6 +132,18 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) return !list_empty(&sem->wait_list); } +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) +/* + * Return just the real task structure pointer of the owner + */ +extern struct task_struct *rwsem_owner(struct rw_semaphore *sem); + +/* + * Return true if the rwsem is owned by a reader. + */ +extern bool is_rwsem_reader_owned(struct rw_semaphore *sem); +#endif + #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h index 876130091384..f06f7b785091 100644 --- a/include/linux/sprintf.h +++ b/include/linux/sprintf.h @@ -23,7 +23,7 @@ __scanf(2, 0) int vsscanf(const char *, const char *, va_list); /* These are for specific cases, do not use without real need */ extern bool no_hash_pointers; -int no_hash_pointers_enable(char *str); +void hash_pointers_finalize(bool slub_debug); /* Used for Rust formatting ('%pA') */ char *rust_fmt_argument(char *buf, char *end, const void *ptr); diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h new file mode 100644 index 000000000000..89d77dc4f2ed --- /dev/null +++ b/include/linux/sys_info.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SYS_INFO_H +#define _LINUX_SYS_INFO_H + +#include <linux/sysctl.h> + +/* + * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special + * handling which only fits panic case. 
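 *
 * Review note: these are independent mask bits, so callers OR them together
 * before handing them to sys_info(), e.g. an illustrative combination built
 * only from the constants defined below:
 *
 *	sys_info(SYS_INFO_TASKS | SYS_INFO_MEM | SYS_INFO_ALL_CPU_BT);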
+ */ +#define SYS_INFO_TASKS 0x00000001 +#define SYS_INFO_MEM 0x00000002 +#define SYS_INFO_TIMERS 0x00000004 +#define SYS_INFO_LOCKS 0x00000008 +#define SYS_INFO_FTRACE 0x00000010 +#define SYS_INFO_PANIC_CONSOLE_REPLAY 0x00000020 +#define SYS_INFO_ALL_CPU_BT 0x00000040 +#define SYS_INFO_BLOCKED_TASKS 0x00000080 + +void sys_info(unsigned long si_mask); +unsigned long sys_info_parse_param(char *str); + +#ifdef CONFIG_SYSCTL +int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, + void *buffer, size_t *lenp, + loff_t *ppos); +#endif +#endif /* _LINUX_SYS_INFO_H */ diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index df42511438d0..27f57eca8cb1 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -178,32 +178,6 @@ struct xxh64_state { void xxh32_reset(struct xxh32_state *state, uint32_t seed); /** - * xxh32_update() - hash the data given and update the xxh32 state - * - * @state: The xxh32 state to update. - * @input: The data to hash. - * @length: The length of the data to hash. - * - * After calling xxh32_reset() call xxh32_update() as many times as necessary. - * - * Return: Zero on success, otherwise an error code. - */ -int xxh32_update(struct xxh32_state *state, const void *input, size_t length); - -/** - * xxh32_digest() - produce the current xxh32 hash - * - * @state: Produce the current xxh32 hash of this state. - * - * A hash value can be produced at any time. It is still possible to continue - * inserting input into the hash state after a call to xxh32_digest(), and - * generate new hashes later on, by calling xxh32_digest() again. - * - * Return: The xxh32 hash stored in the state. - */ -uint32_t xxh32_digest(const struct xxh32_state *state); - -/** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * * @state: The xxh64 state to reset. diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 5ae1741ea8ea..8958ebfcff94 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -27,6 +27,7 @@ #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 +#define KEXEC_FILE_NO_CMA 0x00000010 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 1c23e6387f13..7dab04cf4a36 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -178,7 +178,7 @@ int xenbus_printf(struct xenbus_transaction t, * sprintf-style type string, and pointer. Returns 0 or errno.*/ int xenbus_gather(struct xenbus_transaction t, const char *dir, ...); -/* notifer routines for when the xenstore comes up */ +/* notifier routines for when the xenstore comes up */ extern int xenstored_ready; int register_xenstore_notifier(struct notifier_block *nb); void unregister_xenstore_notifier(struct notifier_block *nb); diff --git a/init/Kconfig b/init/Kconfig index 2357458fb451..836320251219 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -172,6 +172,10 @@ menu "General setup" config BROKEN bool + help + This option allows you to choose whether you want to try to + compile (and fix) old drivers that haven't been updated to + new infrastructure. config BROKEN_ON_SMP bool diff --git a/init/main.c b/init/main.c index f9f401b6fdfb..0ee0ee7b7c2c 100644 --- a/init/main.c +++ b/init/main.c @@ -1587,7 +1587,11 @@ static noinline void __init kernel_init_freeable(void) * check if there is an early userspace init. 
If yes, let it do all * the work */ - if (init_eaccess(ramdisk_execute_command) != 0) { + int ramdisk_command_access; + ramdisk_command_access = init_eaccess(ramdisk_execute_command); + if (ramdisk_command_access != 0) { + pr_warn("check access for rdinit=%s failed: %i, ignoring\n", + ramdisk_execute_command, ramdisk_command_access); ramdisk_execute_command = NULL; prepare_namespace(); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 335b8425dd4b..a4ef79591eb2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -21,6 +21,7 @@ #include <linux/reboot.h> #include <linux/btf.h> #include <linux/objtool.h> +#include <linux/delay.h> #include <asm/page.h> #include <asm/sections.h> @@ -33,6 +34,11 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +/* time to wait for possible DMA to finish before starting the kdump kernel + * when a CMA reservation is used + */ +#define CMA_DMA_TIMEOUT_SEC 10 + #ifdef CONFIG_CRASH_DUMP int kimage_crash_copy_vmcoreinfo(struct kimage *image) @@ -97,6 +103,14 @@ int kexec_crash_loaded(void) } EXPORT_SYMBOL_GPL(kexec_crash_loaded); +static void crash_cma_clear_pending_dma(void) +{ + if (!crashk_cma_cnt) + return; + + mdelay(CMA_DMA_TIMEOUT_SEC * 1000); +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); + crash_cma_clear_pending_dma(); machine_kexec(kexec_crash_image); } kexec_unlock(); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index acb6bf42e30d..87bf4d41eabb 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> +#include <linux/cma.h> +#include <linux/crash_reserve.h> #include <asm/page.h> #include <asm/sections.h> @@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * That function parses "suffix" crashkernel command lines like * - * crashkernel=size,[high|low] + * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure. 
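 *
 * Review note: with the new SUFFIX_CMA entry a command line can request an
 * additional best-effort CMA reservation alongside the usual one, e.g.
 * (the sizes are illustrative):
 *
 *	crashkernel=512M,high crashkernel=128M,cma
 *
 * The ,cma size is picked up by parse_crashkernel() below and passed to
 * reserve_crashkernel_cma(), which retries with progressively halved
 * request sizes when one contiguous CMA block cannot be reserved.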
*/ @@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; @@ -457,6 +471,56 @@ retry: #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f774367c8e71..7ca1940607bd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -580,8 +580,8 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); + if (ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ if (ret > 0) diff --git a/kernel/exit.c b/kernel/exit.c index 1d8c8ac33c4f..343eb97543d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -693,12 +693,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) diff --git a/kernel/fork.c b/kernel/fork.c index e45354cc7cac..9ce93fd20f82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -189,33 +189,33 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need a zeroed memory without __GFP_ACCOUNT. + */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -224,11 +224,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -241,32 +242,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -274,55 +275,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { 
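/*
 * Review note: this loop is a tiny per-CPU free list of thread stacks.
 * Each slot is claimed with this_cpu_xchg(cached_stacks[i], NULL), which
 * returns the cached area (or NULL) while emptying the slot in one step;
 * the renamed vm_area now receives that result directly instead of going
 * through the old temporary 's'. The producer side is
 * try_release_thread_stack_to_cache() above, which refills the first empty
 * slot. Hedged sketch of the pair, using the names from this patch:
 *
 *	// consumer: pop whatever is cached, leaving the slot NULL
 *	vm_area = this_cpu_xchg(cached_stacks[i], NULL);
 *
 *	// producer: only fill a slot that is currently NULL
 *	struct vm_struct *tmp = NULL;
 *	if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
 *		return true;
 */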
- struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); return -ENOMEM; } - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. - */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -331,7 +324,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -346,7 +339,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -378,8 +377,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -418,7 +416,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -438,11 +437,11 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -458,12 +457,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d2432df2b905..8708a1205f82 100644 --- a/kernel/hung_task.c +++ 
b/kernel/hung_task.c @@ -23,6 +23,7 @@ #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> #include <linux/hung_task.h> +#include <linux/rwsem.h> #include <trace/events/sched.h> @@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; + const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); @@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task) switch (blocker_type) { case BLOCKER_TYPE_MUTEX: - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: - owner = sem_last_holder( - (struct semaphore *)hung_task_blocker_to_lock(blocker)); + owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + owner = (unsigned long)rwsem_owner( + hung_task_blocker_to_lock(blocker)); + rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? + "reader" : "writer"; + rwsem_blocked_by = is_rwsem_reader_owned( + hung_task_blocker_to_lock(blocker)) ? + "reader" : "writer"; break; default: WARN_ON_ONCE(1); @@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", + task->comm, task->pid); + break; } return; } @@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", + task->comm, task->pid, rwsem_blocked_as, t->comm, + t->pid, rwsem_blocked_by); + break; } sched_show_task(t); return; diff --git a/kernel/kcov.c b/kernel/kcov.c index 187ba1b80bda..1d85597057e1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg) /* * Fault in a lazily-faulted vmalloc area before it can be used by - * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the * vmalloc fault handling path is instrumented. 
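The hung_task.c changes above extend blocker reporting from mutexes and semaphores to rw-semaphores, distinguishing whether the hung task waited as a reader or a writer and whether the current owner holds it as a reader or a writer. The blocker annotation packs the lock address and the blocker type into a single word, which works because lock structures are word aligned and the low pointer bits are free. A sketch under that assumption (the constants and helper names here are made up, not the kernel's)::

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    enum {
        BLOCKER_MUTEX        = 0,
        BLOCKER_SEM          = 1,
        BLOCKER_RWSEM_READER = 2,
        BLOCKER_RWSEM_WRITER = 3,
    };
    #define BLOCKER_TYPE_MASK 0x3UL

    static unsigned long blocker_pack(void *lock, unsigned long type)
    {
        /* Relies on the lock being at least 4-byte aligned. */
        assert(((uintptr_t)lock & BLOCKER_TYPE_MASK) == 0);
        return (uintptr_t)lock | type;
    }

    /* Recover the lock pointer, in the style of
     * hung_task_blocker_to_lock(). */
    static void *blocker_to_lock(unsigned long blocker)
    {
        return (void *)(blocker & ~BLOCKER_TYPE_MASK);
    }

    int main(void)
    {
        long lock; /* any word-aligned object */
        unsigned long b = blocker_pack(&lock, BLOCKER_RWSEM_READER);

        printf("lock=%p type=%lu\n", blocker_to_lock(b),
               b & BLOCKER_TYPE_MASK);
        return 0;
    }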
*/ static void kcov_fault_in_area(struct kcov *kcov) diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 351cd7d76dfa..31203f0bacaf 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> +#include <linux/dma-map-ops.h> #include <asm/page.h> #include <asm/sections.h> @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. @@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: 
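In kimage_load_cma_segment() above, the destination may start at an arbitrary offset inside its first page, so the copy loop clamps every chunk to the end of the current page; only the first chunk can be short, after which maddr is page aligned. The arithmetic is easy to check in isolation (PAGE_SIZE is assumed to be 4 KiB here)::

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long maddr = 0x1f40; /* destination, not page aligned */
        unsigned long mbytes = 3 * PAGE_SIZE;

        while (mbytes) {
            /* Clamp to the end of the current page, as the loop in
             * kimage_load_cma_segment() does with min_t(). */
            unsigned long room = PAGE_SIZE - (maddr & ~PAGE_MASK);
            unsigned long mchunk = mbytes < room ? mbytes : room;

            printf("copy %4lu bytes at 0x%lx\n", mchunk, maddr);
            maddr  += mchunk;
            mbytes -= mchunk;
        }
        return 0;
    }

This prints one short 192-byte chunk followed by whole pages, mirroring how the kernel copy honours the segment's in-page offset.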
} #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b835033c65eb..91d46502a817 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include <linux/kernel_read_file.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> +#include <linux/dma-map-ops.h> #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. 
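Before accepting a CMA block, kexec_alloc_contig() above rejects anything that intersects a destination range some segment has already claimed, returning -EBUSY so the caller just below falls back to the classic scattered-segment search. The interval test behind kimage_is_destination_range() is the standard half-open overlap check::

    #include <stdbool.h>
    #include <stdio.h>

    /* [a_start, a_end) intersects [b_start, b_end)? */
    static bool ranges_overlap(unsigned long a_start, unsigned long a_end,
                               unsigned long b_start, unsigned long b_end)
    {
        return a_start < b_end && b_start < a_end;
    }

    int main(void)
    {
        /* CMA block [0x100000, 0x180000) vs an existing segment at
         * [0x140000, 0x200000): they collide, so give the block back. */
        printf("%s\n",
               ranges_overlap(0x100000, 0x180000, 0x140000, 0x200000)
                   ? "-EBUSY: fall back to the scattered path"
                   : "use the CMA block");
        return 0;
    }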
+ */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. */ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); diff --git a/kernel/kthread.c b/kernel/kthread.c index 85fc068f0083..0e98b228a8ef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k) /* * Variant of to_kthread() that doesn't assume @p is a kthread. * - * Per construction; when: + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. * - * (p->flags & PF_KTHREAD) && p->worker_private - * - * the task is both a kthread and struct kthread is persistent. However - * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and - * begin_new_exec()). + * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8572dba95af4..24df4d98f7d2 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/hung_task.h> #include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT @@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. 
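rwsem_owner() and is_rwsem_reader_owned() above lose their static/inline so the hung-task detector can share them. Both depend on the owner field being a task pointer with state flags folded into its low bits (the document shows the masking via RWSEM_OWNER_FLAGS_MASK). A userspace rendering of that layout, with illustrative flag values::

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Flag bits folded into the owner word; the real set lives behind
     * RWSEM_OWNER_FLAGS_MASK in rwsem.c. */
    #define OWNER_READER 0x1UL
    #define OWNER_FLAGS  0x3UL

    static _Atomic uintptr_t owner;

    /* One atomic store publishes pointer and flags together, as in
     * __rwsem_set_reader_owned(). */
    static void set_reader_owned(void *task)
    {
        atomic_store(&owner, (uintptr_t)task | OWNER_READER);
    }

    /* Strip the flag bits to recover the task pointer, as
     * rwsem_owner() does. */
    static void *owner_task(void)
    {
        return (void *)(atomic_load(&owner) & ~OWNER_FLAGS);
    }

    int main(void)
    {
        int task; /* stands in for a task_struct */

        set_reader_owned(&task);
        printf("owner=%p reader=%lu\n", owner_task(),
               (unsigned long)(atomic_load(&owner) & OWNER_READER));
        return 0;
    }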
@@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { @@ -1063,10 +1064,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; @@ -1081,8 +1085,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1144,6 +1152,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1177,6 +1188,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 51ddd8866ef3..618202578b42 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -112,6 +112,13 @@ struct find_symbol_arg { enum mod_license license; }; +/* modules using other modules */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + int mod_verify_sig(const void *mod, struct load_info *info); int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); diff --git a/kernel/module/main.c b/kernel/module/main.c index 81f9df8859dc..7f8bb51aedd4 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -608,7 +608,7 @@ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); static struct { - char name[MODULE_NAME_LEN + 1]; + char name[MODULE_NAME_LEN]; char taints[MODULE_FLAGS_BUF_SIZE]; } last_unloaded_module; @@ -779,14 +779,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, struct module *mod; char name[MODULE_NAME_LEN]; char buf[MODULE_FLAGS_BUF_SIZE]; - int ret, forced = 0; + int ret, len, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; + len = strncpy_from_user(name, name_user, MODULE_NAME_LEN); + if (len == 0 || len == MODULE_NAME_LEN) + return -ENOENT; + if (len < 0) + return len; 
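The delete_module() rework above leans on strncpy_from_user()'s return convention: negative on fault, otherwise the string length, which equals the buffer size exactly when the name did not fit (and was therefore not NUL-terminated). Empty and oversized names both become -ENOENT, since neither can name a loaded module. A userspace model of the three cases, with copy_name() standing in for the kernel helper::

    #include <stdio.h>
    #include <string.h>

    #define MODULE_NAME_LEN 56 /* as in the kernel */

    /* Stand-in for strncpy_from_user(): returns the string length, or
     * n when the source string did not fit in n bytes. */
    static long copy_name(char *dst, const char *src, size_t n)
    {
        size_t len = strnlen(src, n);

        memcpy(dst, src, len);
        if (len < n)
            dst[len] = '\0';
        return (long)len;
    }

    int main(void)
    {
        const char *inputs[] = {
            "", "loop",
            "a_name_definitely_longer_than_the_fifty_six_byte_module_limit",
        };

        for (int i = 0; i < 3; i++) {
            char name[MODULE_NAME_LEN];
            long len = copy_name(name, inputs[i], sizeof(name));

            if (len == 0 || len == MODULE_NAME_LEN)
                printf("'%s' -> -ENOENT\n", inputs[i]);
            else
                printf("'%s' -> try to delete\n", name);
        }
        return 0;
    }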
audit_log_kern_module(name); diff --git a/kernel/panic.c b/kernel/panic.c index 43817111c979..72fcbb5a071b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,7 @@ #include <linux/sysfs.h> #include <linux/context_tracking.h> #include <linux/seq_buf.h> +#include <linux/sys_info.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -63,20 +64,13 @@ int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); -#define PANIC_PRINT_TASK_INFO 0x00000001 -#define PANIC_PRINT_MEM_INFO 0x00000002 -#define PANIC_PRINT_TIMER_INFO 0x00000004 -#define PANIC_PRINT_LOCK_INFO 0x00000008 -#define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 -#define PANIC_PRINT_ALL_CPU_BT 0x00000040 -#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -128,6 +122,13 @@ static int proc_taint(const struct ctl_table *table, int write, return err; } +static int sysctl_panic_print_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -165,7 +166,7 @@ static const struct ctl_table kern_panic_table[] = { .data = &panic_print, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = sysctl_panic_print_handler, }, { .procname = "panic_on_warn", @@ -193,6 +194,13 @@ static const struct ctl_table kern_panic_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_sys_info", + .data = &panic_print, + .maxlen = sizeof(panic_print), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static __init int kernel_panic_sysctls_init(void) @@ -203,6 +211,15 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." 
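The boot parameter handled just below takes a comma-separated list and folds it into the same bitmask that panic_print carries, via sys_info_parse_param() (its body is not shown in this digest). A toy parser of the same shape, with made-up flag values::

    #include <stdio.h>
    #include <string.h>

    /* Illustrative flag bits; the real SYS_INFO_* values live in
     * <linux/sys_info.h>, which this digest does not show. */
    #define SI_TASKS  0x1UL
    #define SI_MEM    0x2UL
    #define SI_LOCKS  0x4UL
    #define SI_FTRACE 0x8UL

    static unsigned long parse_sys_info(char *buf)
    {
        unsigned long v = 0;

        for (char *tok = strtok(buf, ","); tok; tok = strtok(NULL, ",")) {
            if (!strcmp(tok, "tasks"))
                v |= SI_TASKS;
            else if (!strcmp(tok, "mem"))
                v |= SI_MEM;
            else if (!strcmp(tok, "locks"))
                v |= SI_LOCKS;
            else if (!strcmp(tok, "ftrace"))
                v |= SI_FTRACE;
        }
        return v;
    }

    int main(void)
    {
        char cmdline[] = "tasks,mem";

        printf("panic_print = %#lx\n", parse_sys_info(cmdline)); /* 0x3 */
        return 0;
    }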
*/ +static int __init setup_panic_sys_info(char *buf) +{ + /* There is no risk of race in kernel boot phase */ + panic_print = sys_info_parse_param(buf); + return 1; +} +__setup("panic_sys_info=", setup_panic_sys_info); + static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS @@ -298,33 +315,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(bool console_flush) -{ - if (console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); - return; - } - - if (panic_print & PANIC_PRINT_TASK_INFO) - show_state(); - - if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(); - - if (panic_print & PANIC_PRINT_TIMER_INFO) - sysrq_timer_list_show(); - - if (panic_print & PANIC_PRINT_LOCK_INFO) - debug_show_all_locks(); - - if (panic_print & PANIC_PRINT_FTRACE_INFO) - ftrace_dump(DUMP_ALL); - - if (panic_print & PANIC_PRINT_BLOCKED_TASKS) - show_state_filter(TASK_UNINTERRUPTIBLE); -} - void check_panic_on_warn(const char *origin) { unsigned int limit; @@ -345,7 +335,7 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + if (panic_print & SYS_INFO_ALL_CPU_BT) { /* Temporary allow non-panic CPUs to write their backtraces. */ panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); @@ -468,7 +458,7 @@ void vpanic(const char *fmt, va_list args) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(false); + sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); @@ -497,7 +487,9 @@ void vpanic(const char *fmt, va_list args) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(true); + if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) || + panic_console_replay) + console_flush_on_panic(CONSOLE_REPLAY_ALL); if (!panic_blink) panic_blink = no_blink; @@ -949,6 +941,7 @@ core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); +core_param(panic_console_replay, panic_console_replay, bool, 0644); static int __init oops_setup(char *s) { diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index bbed41ad29cf..ef282001f200 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool printk_kthreads_ready; extern bool debug_non_panic_cpus; __printf(4, 0) @@ -179,6 +180,7 @@ static inline void nbcon_kthread_wake(struct console *con) #define PRINTKRB_RECORD_MAX 0 #define printk_kthreads_running (false) +#define printk_kthreads_ready (false) /* * In !PRINTK builds we still export console_sem diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..646801813415 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) /** * nbcon_context_try_acquire_direct - Try to acquire directly - * @ctxt: The context of the caller - * @cur: The current console state + * @ctxt: The context of the caller + * @cur: The current console state + * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. 
Also acquire the console when * the current owner has a lower priority and the console is in a safe state. @@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) * * Errors: * - * -EPERM: A panic is in progress and this is not the panic CPU. - * Or the current owner or waiter has the same or higher - * priority. No acquire method can be successful in - * this case. + * -EPERM: A panic is in progress and this is neither the panic + * CPU nor is this a reacquire. Or the current owner or + * waiter has the same or higher priority. No acquire + * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, - struct nbcon_state *cur) + struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, do { /* - * Panic does not imply that the console is owned. However, it - * is critical that non-panic CPUs during panic are unable to - * acquire ownership in order to satisfy the assumptions of - * nbcon_waiter_matches(). In particular, the assumption that - * lower priorities are ignored during panic. + * Panic does not imply that the console is owned. However, + * since all non-panic CPUs are stopped during panic(), it + * is safer to have them avoid gaining console ownership. + * + * If this acquire is a reacquire (and an unsafe takeover + * has not previously occurred) then it is allowed to attempt + * a direct acquire in panic. This gives console drivers an + * opportunity to perform any necessary cleanup if they were + * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && + (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; + } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; @@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. - * Events #4 and #5 are not possible due to the other_cpu_in_panic() - * check in nbcon_context_try_acquire_direct(). + * Event #4 occurs when a non-panic CPU reacquires. + * Event #5 is not possible due to the other_cpu_in_panic() check + * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); @@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * wait for a handover in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ + if (other_cpu_in_panic()) + return -EPERM; + /* Handover is not possible on the same CPU. 
*/ if (cur->cpu == cpu) return -EBUSY; @@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console - * @ctxt: The context of the caller + * @ctxt: The context of the caller + * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. @@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) +static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) nbcon_state_read(con, &cur); try_again: - err = nbcon_context_try_acquire_direct(ctxt, &cur); + err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; @@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); - while (!nbcon_context_try_acquire(ctxt)) + while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); @@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) cant_migrate(); } - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* @@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; while (nbcon_seq_read(con) < stop_seq) { @@ -1671,6 +1690,9 @@ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; + /* Synchronize the kthread start. */ + lockdep_assert_console_list_lock_held(); + /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; @@ -1701,12 +1723,15 @@ bool nbcon_alloc(struct console *con) return false; } - if (printk_kthreads_running) { + if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } + + /* Might be the first kthread. */ + printk_kthreads_running = true; } } @@ -1716,14 +1741,30 @@ bool nbcon_alloc(struct console *con) /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data + * + * Important: @have_nbcon_console must be updated before calling + * this function. In particular, it can be set only when there + * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; - if (printk_kthreads_running) + /* Synchronize the kthread stop. */ + lockdep_assert_console_list_lock_held(); + + if (printk_kthreads_running) { nbcon_kthread_stop(con); + /* Might be the last nbcon console. + * + * Do not rely on printk_kthreads_check_locked(). It is not + * called in some code paths, see nbcon_free() callers. + */ + if (!have_nbcon_console) + printk_kthreads_running = false; + } + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. 
*/ @@ -1762,7 +1803,7 @@ bool nbcon_device_try_acquire(struct console *con) ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1eea80d0648e..0efbcdda9aab 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3574,7 +3574,7 @@ EXPORT_SYMBOL(console_resume); static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ -static bool printk_kthreads_ready __ro_after_init; +bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; @@ -3713,6 +3713,7 @@ static void printk_kthreads_check_locked(void) if (!printk_kthreads_ready) return; + /* Start or stop the legacy kthread when needed. */ if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && @@ -4204,14 +4205,6 @@ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); - if (console->flags & CON_NBCON) - nbcon_free(console); - - console_sysfs_notify(); - - if (console->exit) - res = console->exit(console); - /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. @@ -4232,6 +4225,15 @@ static int unregister_console_locked(struct console *console) if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; + /* @have_nbcon_console must be updated before calling nbcon_free(). */ + if (console->flags & CON_NBCON) + nbcon_free(console); + + console_sysfs_notify(); + + if (console->exit) + res = console->exit(console); + /* Changed console list, may require printer threads to start/stop. 
*/ printk_kthreads_check_locked(); diff --git a/kernel/relay.c b/kernel/relay.c index c0c93a04d4ce..8d915fe98198 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) return NULL; for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); + buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); if (unlikely(!buf->page_array[i])) goto depopulate; set_page_private(buf->page_array[i], (unsigned long)buf); @@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) if (!mem) goto depopulate; - memset(mem, 0, *size); buf->page_count = n_pages; return mem; @@ -250,13 +249,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -298,11 +302,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -555,9 +561,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -582,7 +590,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } @@ -595,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -655,11 +663,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -694,6 +697,42 @@ void relay_flush(struct rchan *chan) EXPORT_SYMBOL_GPL(relay_flush); /** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of certain field that caller specifies. 
+ */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } + } + } + + return count; +} + +/** * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2024c1d36402..59fdb7ebbf22 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -176,7 +176,7 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static DEFINE_PER_CPU(seqcount_t, psi_seq); +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); static inline void psi_write_begin(int cpu) { @@ -204,11 +204,7 @@ static void poll_timer_fn(struct timer_list *t); static void group_init(struct psi_group *group) { - int cpu; - group->enabled = true; - for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 47168d2afbf1..6941145b5058 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. 
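With relay itself keeping full_count/big_count, blktrace no longer needs its own atomic dropped counter: blk_dropped_read() above simply sums relay's per-buffer full_count across CPUs through relay_stats(). The aggregation pattern, reduced to userspace (buffer layout and flag names are illustrative)::

    #include <stdio.h>

    #define NCPU 4
    enum { STATS_BUF_FULL = 1, STATS_WRT_BIG = 2 };

    struct buf_stats { unsigned full_count, big_count; };

    /* Per-CPU buffers, as with relay's chan->buf. */
    static struct buf_stats stats[NCPU] = { {2, 0}, {0, 1}, {5, 0}, {1, 0} };

    /* Sum one statistic across CPUs, in the style of relay_stats(). */
    static size_t chan_stats(int flags)
    {
        size_t count = 0;

        for (int cpu = 0; cpu < NCPU; cpu++) {
            if (flags & STATS_BUF_FULL)
                count += stats[cpu].full_count;
            else if (flags & STATS_WRT_BIG)
                count += stats[cpu].big_count;
        }
        return count;
    }

    int main(void)
    {
        printf("dropped: %zu\n", chan_stats(STATS_BUF_FULL)); /* 8 */
        return 0;
    }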
- */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5176e0270f07..bb71a0dc9d69 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4812,26 +4812,26 @@ int ring_buffer_write(struct trace_buffer *buffer, int ret = -EBUSY; int cpu; - preempt_disable_notrace(); + guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) - goto out; + return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) - goto out; + return -EBUSY; if (length > buffer->max_data_size) - goto out; + return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) - goto out; + return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) @@ -4849,10 +4849,6 @@ int ring_buffer_write(struct trace_buffer *buffer, out_unlock: trace_recursive_unlock(cpu_buffer); - - out: - preempt_enable_notrace(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9716178f728..4283ed4e8f59 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -432,15 +432,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; - preempt_disable_notrace(); + guard(preempt_notrace)(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } - - preempt_enable_notrace(); } static inline void @@ -497,27 +495,18 @@ int register_ftrace_export(struct trace_export *export) if (WARN_ON_ONCE(!export->write)) return -1; - mutex_lock(&ftrace_export_lock); + guard(mutex)(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); - mutex_unlock(&ftrace_export_lock); - return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; + guard(mutex)(&ftrace_export_lock); + return rm_ftrace_export(&ftrace_exports_list, export); } EXPORT_SYMBOL_GPL(unregister_ftrace_export); @@ -640,9 +629,8 @@ void trace_array_put(struct trace_array *this_tr) if (!this_tr) return; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); - mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); @@ -1160,13 +1148,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, trace_ctx = tracing_gen_ctx(); buffer = 
tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); - if (!event) { - size = 0; - goto out; - } + if (!event) + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1182,8 +1168,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - out: - ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); @@ -1213,7 +1197,6 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); - int ret = 0; if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); @@ -1227,11 +1210,11 @@ int __trace_bputs(unsigned long ip, const char *str) trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) - goto out; + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1240,10 +1223,7 @@ int __trace_bputs(unsigned long ip, const char *str) __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - ret = 1; - out: - ring_buffer_nest_end(buffer); - return ret; + return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); @@ -1432,13 +1412,8 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) int tracing_arm_snapshot(struct trace_array *tr) { - int ret; - - mutex_lock(&trace_types_lock); - ret = tracing_arm_snapshot_locked(tr); - mutex_unlock(&trace_types_lock); - - return ret; + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); } void tracing_disarm_snapshot(struct trace_array *tr) @@ -1841,7 +1816,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; @@ -1855,7 +1830,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; } @@ -1865,8 +1840,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } } @@ -1874,13 +1848,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; - else { - ret = -EINVAL; - goto out; - } + else + return -EINVAL; + ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; } @@ -1895,15 +1868,11 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { - ret = -EINVAL; - goto out; + return -EINVAL; } *ppos += read; - ret = read; - -out: - return ret; + return read; } /* TODO add a seq_buf_to_buffer() */ @@ -2405,10 +2374,10 @@ int __init register_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? 
*/ @@ -2420,8 +2389,7 @@ int __init register_tracer(struct tracer *type) /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) @@ -2498,9 +2466,8 @@ void tracing_reset_all_online_cpus_unlocked(void) void tracing_reset_all_online_cpus(void) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); - mutex_unlock(&trace_types_lock); } int is_tracing_stopped(void) @@ -2511,18 +2478,17 @@ int is_tracing_stopped(void) static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; if (tracing_disabled) return; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } - goto out; + return; } /* Prevent the buffers from switching */ @@ -2539,9 +2505,6 @@ static void tracing_start_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2559,11 +2522,10 @@ void tracing_start(void) static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (tr->stop_count++) - goto out; + return; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); @@ -2579,9 +2541,6 @@ static void tracing_stop_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2694,12 +2653,12 @@ void trace_buffered_event_enable(void) per_cpu(trace_buffered_event, cpu) = event; - preempt_disable(); - if (cpu == smp_processor_id() && - __this_cpu_read(trace_buffered_event) != - per_cpu(trace_buffered_event, cpu)) - WARN_ON_ONCE(1); - preempt_enable(); + scoped_guard(preempt,) { + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + } } } @@ -3044,7 +3003,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, skip++; #endif - preempt_disable_notrace(); + guard(preempt_notrace)(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; @@ -3102,8 +3061,6 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); - preempt_enable_notrace(); - } static inline void ftrace_trace_stack(struct trace_array *tr, @@ -3186,9 +3143,9 @@ ftrace_trace_userstack(struct trace_array *tr, * prevent recursion, since the user stack tracing may * trigger other kernel events. 
*/ - preempt_disable(); + guard(preempt)(); if (__this_cpu_read(user_stack_count)) - goto out; + return; __this_cpu_inc(user_stack_count); @@ -3206,8 +3163,6 @@ ftrace_trace_userstack(struct trace_array *tr, out_drop_count: __this_cpu_dec(user_stack_count); - out: - preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, @@ -3389,7 +3344,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { @@ -3404,26 +3359,23 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; -out: - ring_buffer_nest_end(buffer); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } out_put: put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -3447,7 +3399,7 @@ int __trace_array_vprintk(struct trace_buffer *buffer, pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); @@ -3459,24 +3411,22 @@ int __trace_array_vprintk(struct trace_buffer *buffer, len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } out: - ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -4800,20 +4750,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); - ret = -ENODEV; + return -ENODEV; } else { event_file_get(file); } - mutex_unlock(&event_mutex); - if (ret) - return ret; - filp->private_data = inode->i_private; return 0; @@ -5090,7 +5036,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; - char *mask_str; + char 
*mask_str __free(kfree) = NULL; int len; len = snprintf(NULL, 0, "%*pb\n", @@ -5101,16 +5047,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_err; - } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); - -out_err: - kfree(mask_str); + if (len >= count) + return -EINVAL; - return count; + return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } int tracing_set_cpumask(struct trace_array *tr, @@ -5957,9 +5897,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, char buf[MAX_TRACER_SIZE+2]; int r; - mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", tr->current_trace->name); - mutex_unlock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + r = sprintf(buf, "%s\n", tr->current_trace->name); + } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -6261,15 +6201,13 @@ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); - mutex_unlock(&trace_types_lock); - return ret; } @@ -6566,7 +6504,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) @@ -6610,7 +6548,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) tr->trace_ref++; - mutex_unlock(&trace_types_lock); return ret; fail: @@ -6619,7 +6556,6 @@ fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); - mutex_unlock(&trace_types_lock); return ret; } @@ -6628,14 +6564,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; - mutex_lock(&trace_types_lock); - - tr->trace_ref--; + scoped_guard(mutex, &trace_types_lock) { + tr->trace_ref--; - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); - close_pipe_on_cpu(tr, iter->cpu_file); - mutex_unlock(&trace_types_lock); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + } free_trace_iter_content(iter); kfree(iter); @@ -7438,7 +7373,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr->clock_id = i; @@ -7462,8 +7397,6 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tscratch->clock_id = i; } - mutex_unlock(&trace_types_lock); - return 0; } @@ -7515,15 +7448,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8111,14 +8042,14 @@ static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); 
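Most of the trace.c churn above replaces lock/unlock and preempt_disable/enable pairs with guard() and scoped_guard(), which release on every exit path and so remove the out:-label unwinding. The kernel builds these on compiler cleanup attributes; a rough userspace equivalent with pthreads (guard_mutex() is a stand-in, not the kernel macro)::

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

    static void unlocker(pthread_mutex_t **mp)
    {
        pthread_mutex_unlock(*mp);
    }

    /* Lock now, unlock automatically when the scope is left, whichever
     * return is taken (GCC/Clang cleanup attribute). */
    #define guard_mutex(mutex) \
        pthread_mutex_t *g_ __attribute__((cleanup(unlocker))) = \
            (pthread_mutex_lock(mutex), (mutex))

    static int counter;

    static int bump(int fail)
    {
        guard_mutex(&m);
        if (fail)
            return -1; /* early return: still unlocks */
        counter++;
        return 0;      /* normal return: still unlocks */
    }

    int main(void)
    {
        bump(0);
        bump(1);
        printf("counter=%d\n", counter); /* 1 */
        return 0;
    }

Compile with -pthread. The early-return path is exactly what used to require a goto to the unlock label.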
} tr->n_err_log_entries = 0; - mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) @@ -8389,7 +8320,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); iter->tr->trace_ref--; @@ -8400,8 +8331,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) info->spare_cpu, info->spare); kvfree(info); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8609,14 +8538,13 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - mutex_unlock(&trace_types_lock); return 0; } @@ -8957,12 +8885,12 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) - goto out; + return ret; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); - out: + return ret < 0 ? ret : 0; } @@ -9106,10 +9034,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); - mutex_unlock(&trace_types_lock); if (ret) return ret; } @@ -9418,7 +9345,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { @@ -9432,7 +9359,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf, /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } - mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -9816,10 +9742,9 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracer_options_updated = true; __update_tracer_options(tr); - mutex_unlock(&trace_types_lock); } /* Must have trace_types_lock held */ @@ -9841,11 +9766,10 @@ struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; - mutex_unlock(&trace_types_lock); return tr; } @@ -10376,7 +10300,7 @@ bool module_exists(const char *module) { /* All modules have the symbol __this_module */ static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; @@ -10803,7 +10727,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { - char *kbuf, *buf, *tmp; + char *kbuf __free(kfree) = NULL; + char *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -10818,10 +10743,9 @@ ssize_t trace_parse_run_command(struct file *file, const char 
__user *buffer, if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } + if (copy_from_user(kbuf, buffer + done, size)) + return -EFAULT; + kbuf[size] = '\0'; buf = kbuf; do { @@ -10837,8 +10761,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; + return -EINVAL; } } done += size; @@ -10851,17 +10774,12 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, ret = createfn(buf); if (ret) - goto out; + return ret; buf += size; } while (done < count); } - ret = done; - -out: - kfree(kbuf); - - return ret; + return done; } #ifdef CONFIG_TRACER_MAX_TRACE @@ -11064,7 +10982,7 @@ __init static int tracer_alloc_buffers(void) BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; @@ -11182,7 +11100,6 @@ out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 33cfbd4ed76d..f24ee61f8884 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -536,12 +536,12 @@ static notrace void trace_event_raw_event_synth(void *__data, * is being performed within another event. */ buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - goto out; + return; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -584,8 +584,6 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0b3db02030a7..97db0b0ccf3e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -701,6 +701,7 @@ void print_function_args(struct trace_seq *s, unsigned long *args, struct btf *btf; s32 tid, nr = 0; int a, p, x; + u16 encode; trace_seq_printf(s, "("); @@ -744,7 +745,12 @@ void print_function_args(struct trace_seq *s, unsigned long *args, trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_INT: - trace_seq_printf(s, "%ld", arg); + encode = btf_int_encoding(t); + /* Print unsigned ints as hex */ + if (encode & BTF_INT_SIGNED) + trace_seq_printf(s, "%ld", arg); + else + trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_ENUM: trace_seq_printf(s, "%ld", arg); diff --git a/kernel/ucount.c b/kernel/ucount.c index 8686e329b8f2..586af49fc03e 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -199,18 +199,16 @@ void put_ucounts(struct ucounts *ucounts) } } -static inline bool atomic_long_inc_below(atomic_long_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { - long c, old; - c = atomic_long_read(v); - for (;;) { + long c = atomic_long_read(v); + + do { if (unlikely(c >= u)) return false; - old = atomic_long_cmpxchg(v, c, c+1); - if (likely(old == c)) - return true; - c = old; - } 
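The atomic_long_inc_below() rewrite above, completed on the hunk lines that follow, relies on atomic_long_try_cmpxchg() writing the freshly observed value back through its second argument on failure, so the loop no longer re-reads or shuffles 'old' by hand; note the bound also becomes 'long' to match the atomic_long_t it is compared against. The resulting idiom, shown standalone (inc_if_below is an illustrative name)::

    #include <linux/atomic.h>

    static bool inc_if_below(atomic_long_t *v, long limit)
    {
            long c = atomic_long_read(v);

            do {
                    if (unlikely(c >= limit))
                            return false;
                    /* on failure, try_cmpxchg refreshes 'c' for the retry */
            } while (!atomic_long_try_cmpxchg(v, &c, c + 1));

            return true;
    }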
+ } while (!atomic_long_try_cmpxchg(v, &c, c+1)); + + return true; } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 53332a1d8af4..dc0e0c6ed075 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3214,6 +3214,26 @@ config TEST_OBJPOOL If unsure, say N. +config TEST_KEXEC_HANDOVER + bool "Test for Kexec HandOver" + default n + depends on KEXEC_HANDOVER + help + This option enables test for Kexec HandOver (KHO). + The test consists of two parts: saving kernel data before kexec and + restoring the data after kexec and verifying that it was properly + handed over. This test module creates and saves data on the boot of + the first kernel and restores and verifies the data on the boot of + kexec'ed kernel. + + For detailed documentation about KHO, see Documentation/core-api/kho. + + To run the test run: + + tools/testing/selftests/kho/vmtest.sh -h + + If unsure, say N. + config RATELIMIT_KUNIT_TEST tristate "KUnit Test for correctness and stress of ratelimit" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index 06b954473222..392ff808c9b9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o objpool.o iomem_copy.o + buildid.o objpool.o iomem_copy.o sys_info.o lib-$(CONFIG_UNION_FIND) += union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o @@ -102,6 +102,7 @@ obj-$(CONFIG_TEST_HMM) += test_hmm.o obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o +obj-$(CONFIG_TEST_KEXEC_HANDOVER) += test_kho.o obj-$(CONFIG_TEST_FPU) += test_fpu.o test_fpu-y := test_fpu_glue.o test_fpu_impl.o diff --git a/lib/kunit/test.c b/lib/kunit/test.c index f3c6b11f12b8..d2bfa331a2b1 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -802,7 +802,6 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites) } EXPORT_SYMBOL_GPL(__kunit_test_suites_exit); -#ifdef CONFIG_MODULES static void kunit_module_init(struct module *mod) { struct kunit_suite_set suite_set, filtered_set; @@ -890,7 +889,6 @@ static struct notifier_block kunit_mod_nb = { .notifier_call = kunit_module_notify, .priority = 0, }; -#endif KUNIT_DEFINE_ACTION_WRAPPER(kfree_action_wrapper, kfree, const void *) @@ -981,20 +979,14 @@ static int __init kunit_init(void) kunit_debugfs_init(); kunit_bus_init(); -#ifdef CONFIG_MODULES return register_module_notifier(&kunit_mod_nb); -#else - return 0; -#endif } late_initcall(kunit_init); static void __exit kunit_exit(void) { memset(&kunit_hooks, 0, sizeof(kunit_hooks)); -#ifdef CONFIG_MODULES unregister_module_notifier(&kunit_mod_nb); -#endif kunit_bus_shutdown(); diff --git a/lib/math/div64.c b/lib/math/div64.c index 5faa29208bdb..bf77b9843175 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -212,12 +212,13 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) #endif - /* make sure c is not zero, trigger exception otherwise */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdiv-by-zero" - if (unlikely(c == 0)) - return 1/0; -#pragma GCC diagnostic pop + /* make sure c is not zero, trigger runtime exception otherwise */ + if (unlikely(c == 0)) { + unsigned long zero = 0; + + OPTIMIZER_HIDE_VAR(zero); + return ~0UL/zero; + } int shift = __builtin_ctzll(c); diff 
--git a/lib/math/gcd.c b/lib/math/gcd.c index e3b042214d1b..62efca6787ae 100644 --- a/lib/math/gcd.c +++ b/lib/math/gcd.c @@ -11,22 +11,16 @@ * has decent hardware division. */ +DEFINE_STATIC_KEY_TRUE(efficient_ffs_key); + #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) /* If __ffs is available, the even/odd algorithm benchmarks slower. */ -/** - * gcd - calculate and return the greatest common divisor of 2 unsigned longs - * @a: first value - * @b: second value - */ -unsigned long gcd(unsigned long a, unsigned long b) +static unsigned long binary_gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; - if (!a || !b) - return r; - b >>= __ffs(b); if (b == 1) return r & -r; @@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#else +#endif /* If normalization is done by loops, the even/odd algorithm is a win. */ + +/** + * gcd - calculate and return the greatest common divisor of 2 unsigned longs + * @a: first value + * @b: second value + */ unsigned long gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; @@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b) if (!a || !b) return r; +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) + if (static_branch_likely(&efficient_ffs_key)) + return binary_gcd(a, b); +#endif + /* Isolate lsbit of r */ r &= -r; @@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#endif - EXPORT_SYMBOL_GPL(gcd); diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 75ce3e134b7c..799e0e5eac26 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,9 +18,6 @@ #else #include <linux/module.h> #include <linux/gfp.h> -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); #endif struct raid6_calls raid6_call; diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index a7c1b2bbe40d..b5e47c008b41 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index 4e8095403ee2..97d598d2535c 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, /* Compute syndrome with 
zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c index 310c715db313..7986120ca444 100644 --- a/lib/raid6/recov_avx512.c +++ b/lib/raid6/recov_avx512.c @@ -37,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -238,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c index 94aeac85e6f7..93dc515997a1 100644 --- a/lib/raid6/recov_loongarch_simd.c +++ b/lib/raid6/recov_loongarch_simd.c @@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index 1bfc14174d4d..70e1404c1512 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -36,10 +36,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -74,7 +74,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 
*)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c index f29303795ccf..5d54c4b437df 100644 --- a/lib/raid6/recov_rvv.c +++ b/lib/raid6/recov_rvv.c @@ -165,10 +165,10 @@ static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c index 4a7aa466f0ef..487018f81192 100644 --- a/lib/raid6/recov_s390xc.c +++ b/lib/raid6/recov_s390xc.c @@ -34,10 +34,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -81,7 +81,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 4bfa3c6b60de..2e849185c32b 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 73d7b50924ef..de0b0025af2b 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,11 +36,11 @@ #include <linux/memblock.h> #include <linux/kasan-enabled.h> -#define DEPOT_POOLS_CAP 8192 -/* The pool_index is offset by 1 so the first record does not have a 0 handle. */ -#define DEPOT_MAX_POOLS \ - (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? 
\ - (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP) +/* + * The pool_index is offset by 1 so the first record does not have a 0 handle. + */ +static unsigned int stack_max_pools __read_mostly = + MIN((1LL << DEPOT_POOL_INDEX_BITS) - 1, 8192); static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); @@ -62,7 +62,7 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack records. */ -static void *stack_pools[DEPOT_MAX_POOLS]; +static void **stack_pools; /* Newly allocated pool that is not yet added to stack_pools. */ static void *new_pool; /* Number of pools in stack_pools. */ @@ -101,6 +101,34 @@ static int __init disable_stack_depot(char *str) } early_param("stack_depot_disable", disable_stack_depot); +static int __init parse_max_pools(char *str) +{ + const long long limit = (1LL << (DEPOT_POOL_INDEX_BITS)) - 1; + unsigned int max_pools; + int rv; + + rv = kstrtouint(str, 0, &max_pools); + if (rv) + return rv; + + if (max_pools < 1024) { + pr_err("stack_depot_max_pools below 1024, using default of %u\n", + stack_max_pools); + goto out; + } + + if (max_pools > limit) { + pr_err("stack_depot_max_pools exceeds %lld, using default of %u\n", + limit, stack_max_pools); + goto out; + } + + stack_max_pools = max_pools; +out: + return 0; +} +early_param("stack_depot_max_pools", parse_max_pools); + void __init stack_depot_request_early_init(void) { /* Too late to request early init now. */ @@ -182,6 +210,17 @@ int __init stack_depot_early_init(void) } init_stack_table(entries); + pr_info("allocating space for %u stack pools via memblock\n", + stack_max_pools); + stack_pools = + memblock_alloc(stack_max_pools * sizeof(void *), PAGE_SIZE); + if (!stack_pools) { + pr_err("stack pools allocation failed, disabling\n"); + memblock_free(stack_table, entries * sizeof(struct list_head)); + stack_depot_disabled = true; + return -ENOMEM; + } + return 0; } @@ -231,6 +270,16 @@ int stack_depot_init(void) stack_hash_mask = entries - 1; init_stack_table(entries); + pr_info("allocating space for %u stack pools via kvcalloc\n", + stack_max_pools); + stack_pools = kvcalloc(stack_max_pools, sizeof(void *), GFP_KERNEL); + if (!stack_pools) { + pr_err("stack pools allocation failed, disabling\n"); + kvfree(stack_table); + stack_depot_disabled = true; + ret = -ENOMEM; + } + out_unlock: mutex_unlock(&stack_depot_init_mutex); @@ -245,9 +294,9 @@ static bool depot_init_pool(void **prealloc) { lockdep_assert_held(&pool_lock); - if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { + if (unlikely(pools_num >= stack_max_pools)) { /* Bail out if we reached the pool limit. */ - WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */ + WARN_ON_ONCE(pools_num > stack_max_pools); /* should never happen */ WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */ WARN_ONCE(1, "Stack depot reached limit capacity"); return false; @@ -273,7 +322,7 @@ static bool depot_init_pool(void **prealloc) * NULL; do not reset to NULL if we have reached the maximum number of * pools. 
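With the compile-time DEPOT_MAX_POOLS cap replaced by the runtime variable stack_max_pools, the pool-array capacity becomes tunable at boot: parse_max_pools() above accepts values between 1024 and (1 << DEPOT_POOL_INDEX_BITS) - 1, logging an error and keeping the default otherwise. For example, booting with (the value is illustrative)::

    stack_depot_max_pools=4096

sizes stack_pools[] for 4096 entries, allocated via memblock on the early-init path or kvcalloc() otherwise.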
*/ - if (pools_num < DEPOT_MAX_POOLS) + if (pools_num < stack_max_pools) WRITE_ONCE(new_pool, NULL); else WRITE_ONCE(new_pool, STACK_DEPOT_POISON); diff --git a/lib/sys_info.c b/lib/sys_info.c new file mode 100644 index 000000000000..5bf503fd7ec1 --- /dev/null +++ b/lib/sys_info.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/sched/debug.h> +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/sysctl.h> +#include <linux/nmi.h> + +#include <linux/sys_info.h> + +struct sys_info_name { + unsigned long bit; + const char *name; +}; + +/* + * When 'si_names' gets updated, please make sure the 'sys_info_avail' + * below is updated accordingly. + */ +static const struct sys_info_name si_names[] = { + { SYS_INFO_TASKS, "tasks" }, + { SYS_INFO_MEM, "mem" }, + { SYS_INFO_TIMERS, "timers" }, + { SYS_INFO_LOCKS, "locks" }, + { SYS_INFO_FTRACE, "ftrace" }, + { SYS_INFO_ALL_CPU_BT, "all_bt" }, + { SYS_INFO_BLOCKED_TASKS, "blocked_tasks" }, +}; + +/* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */ +unsigned long sys_info_parse_param(char *str) +{ + unsigned long si_bits = 0; + char *s, *name; + int i; + + s = str; + while ((name = strsep(&s, ",")) && *name) { + for (i = 0; i < ARRAY_SIZE(si_names); i++) { + if (!strcmp(name, si_names[i].name)) { + si_bits |= si_names[i].bit; + break; + } + } + } + + return si_bits; +} + +#ifdef CONFIG_SYSCTL + +static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks"; + +int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(sys_info_avail) + 1]; + struct ctl_table table; + unsigned long *si_bits_global; + + si_bits_global = ro_table->data; + + if (write) { + unsigned long si_bits; + int ret; + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + si_bits = sys_info_parse_param(names); + /* The access to the global value is not synchronized. */ + WRITE_ONCE(*si_bits_global, si_bits); + return 0; + } else { + /* for 'read' operation */ + char *delim = ""; + int i, len = 0; + + for (i = 0; i < ARRAY_SIZE(si_names); i++) { + if (*si_bits_global & si_names[i].bit) { + len += scnprintf(names + len, sizeof(names) - len, + "%s%s", delim, si_names[i].name); + delim = ","; + } + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + return proc_dostring(&table, write, buffer, lenp, ppos); + } +} +#endif + +void sys_info(unsigned long si_mask) +{ + if (si_mask & SYS_INFO_TASKS) + show_state(); + + if (si_mask & SYS_INFO_MEM) + show_mem(); + + if (si_mask & SYS_INFO_TIMERS) + sysrq_timer_list_show(); + + if (si_mask & SYS_INFO_LOCKS) + debug_show_all_locks(); + + if (si_mask & SYS_INFO_FTRACE) + ftrace_dump(DUMP_ALL); + + if (si_mask & SYS_INFO_ALL_CPU_BT) + trigger_all_cpu_backtrace(); + + if (si_mask & SYS_INFO_BLOCKED_TASKS) + show_state_filter(TASK_UNINTERRUPTIBLE); +} diff --git a/lib/test_kho.c b/lib/test_kho.c new file mode 100644 index 000000000000..c2eb899c3b45 --- /dev/null +++ b/lib/test_kho.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test module for KHO + * Copyright (c) 2025 Microsoft Corporation. 
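Back in lib/sys_info.c above, sys_info_parse_param() tokenizes its comma-separated list with strsep(), which modifies the string in place, so callers need a writable buffer rather than a string literal. A minimal usage sketch with names taken from the si_names table::

    char opts[] = "tasks,mem,all_bt";       /* writable: strsep() edits it */
    unsigned long mask = sys_info_parse_param(opts);

    sys_info(mask); /* show_state(), show_mem(), trigger_all_cpu_backtrace() */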
+ * + * Authors: + * Saurabh Sengar <ssengar@microsoft.com> + * Mike Rapoport <rppt@kernel.org> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/libfdt.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/vmalloc.h> +#include <linux/kexec_handover.h> + +#include <net/checksum.h> + +#define KHO_TEST_MAGIC 0x4b484f21 /* KHO! */ +#define KHO_TEST_FDT "kho_test" +#define KHO_TEST_COMPAT "kho-test-v1" + +static long max_mem = (PAGE_SIZE << MAX_PAGE_ORDER) * 2; +module_param(max_mem, long, 0644); + +struct kho_test_state { + unsigned int nr_folios; + struct folio **folios; + struct folio *fdt; + __wsum csum; +}; + +static struct kho_test_state kho_test_state; + +static int kho_test_notifier(struct notifier_block *self, unsigned long cmd, + void *v) +{ + struct kho_test_state *state = &kho_test_state; + struct kho_serialization *ser = v; + int err = 0; + + switch (cmd) { + case KEXEC_KHO_ABORT: + return NOTIFY_DONE; + case KEXEC_KHO_FINALIZE: + /* Handled below */ + break; + default: + return NOTIFY_BAD; + } + + err |= kho_preserve_folio(state->fdt); + err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt)); + + return err ? NOTIFY_BAD : NOTIFY_DONE; +} + +static struct notifier_block kho_test_nb = { + .notifier_call = kho_test_notifier, +}; + +static int kho_test_save_data(struct kho_test_state *state, void *fdt) +{ + phys_addr_t *folios_info __free(kvfree) = NULL; + int err = 0; + + folios_info = kvmalloc_array(state->nr_folios, sizeof(*folios_info), + GFP_KERNEL); + if (!folios_info) + return -ENOMEM; + + for (int i = 0; i < state->nr_folios; i++) { + struct folio *folio = state->folios[i]; + unsigned int order = folio_order(folio); + + folios_info[i] = virt_to_phys(folio_address(folio)) | order; + + err = kho_preserve_folio(folio); + if (err) + return err; + } + + err |= fdt_begin_node(fdt, "data"); + err |= fdt_property(fdt, "nr_folios", &state->nr_folios, + sizeof(state->nr_folios)); + err |= fdt_property(fdt, "folios_info", folios_info, + state->nr_folios * sizeof(*folios_info)); + err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum)); + err |= fdt_end_node(fdt); + + return err; +} + +static int kho_test_prepare_fdt(struct kho_test_state *state) +{ + const char compatible[] = KHO_TEST_COMPAT; + unsigned int magic = KHO_TEST_MAGIC; + ssize_t fdt_size; + int err = 0; + void *fdt; + + fdt_size = state->nr_folios * sizeof(phys_addr_t) + PAGE_SIZE; + state->fdt = folio_alloc(GFP_KERNEL, get_order(fdt_size)); + if (!state->fdt) + return -ENOMEM; + + fdt = folio_address(state->fdt); + + err |= fdt_create(fdt, fdt_size); + err |= fdt_finish_reservemap(fdt); + + err |= fdt_begin_node(fdt, ""); + err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible)); + err |= fdt_property(fdt, "magic", &magic, sizeof(magic)); + err |= kho_test_save_data(state, fdt); + err |= fdt_end_node(fdt); + + err |= fdt_finish(fdt); + + if (err) + folio_put(state->fdt); + + return err; +} + +static int kho_test_generate_data(struct kho_test_state *state) +{ + size_t alloc_size = 0; + __wsum csum = 0; + + while (alloc_size < max_mem) { + int order = get_random_u32() % NR_PAGE_ORDERS; + struct folio *folio; + unsigned int size; + void *addr; + + /* cap allocation so that we won't exceed max_mem */ + if (alloc_size + (PAGE_SIZE << order) > max_mem) { + order = get_order(max_mem - alloc_size); + if (order) + order--; + } + size = PAGE_SIZE << order; 
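The handover contract of this test is a flattened device tree: kho_test_prepare_fdt() above serializes the folio descriptors, count and checksum with the libfdt writer API, and kho_test_restore_data() reads them back after kexec with fdt_getprop(). Reduced to a single-property round trip using the same libfdt calls (the function name and property layout are illustrative)::

    #include <linux/libfdt.h>

    static int demo_fdt_roundtrip(void *fdt, size_t size)
    {
            u32 magic = 0x4b484f21;                 /* KHO! */
            const u32 *val;
            int node, len, err = 0;

            err |= fdt_create(fdt, size);
            err |= fdt_finish_reservemap(fdt);
            err |= fdt_begin_node(fdt, "");
            err |= fdt_property(fdt, "magic", &magic, sizeof(magic));
            err |= fdt_end_node(fdt);
            err |= fdt_finish(fdt);
            if (err)
                    return -EINVAL;

            /* what the kexec'ed kernel would do with the retrieved blob */
            node = fdt_path_offset(fdt, "/");
            val = fdt_getprop(fdt, node, "magic", &len);
            if (!val || len != sizeof(*val) || *val != magic)
                    return -EINVAL;

            return 0;
    }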
+ + folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order); + if (!folio) + goto err_free_folios; + + state->folios[state->nr_folios++] = folio; + addr = folio_address(folio); + get_random_bytes(addr, size); + csum = csum_partial(addr, size, csum); + alloc_size += size; + } + + state->csum = csum; + return 0; + +err_free_folios: + for (int i = 0; i < state->nr_folios; i++) + folio_put(state->folios[i]); + return -ENOMEM; +} + +static int kho_test_save(void) +{ + struct kho_test_state *state = &kho_test_state; + struct folio **folios __free(kvfree) = NULL; + unsigned long max_nr; + int err; + + max_mem = PAGE_ALIGN(max_mem); + max_nr = max_mem >> PAGE_SHIFT; + + folios = kvmalloc_array(max_nr, sizeof(*state->folios), GFP_KERNEL); + if (!folios) + return -ENOMEM; + state->folios = folios; + + err = kho_test_generate_data(state); + if (err) + return err; + + err = kho_test_prepare_fdt(state); + if (err) + return err; + + return register_kho_notifier(&kho_test_nb); +} + +static int kho_test_restore_data(const void *fdt, int node) +{ + const unsigned int *nr_folios; + const phys_addr_t *folios_info; + const __wsum *old_csum; + __wsum csum = 0; + int len; + + node = fdt_path_offset(fdt, "/data"); + + nr_folios = fdt_getprop(fdt, node, "nr_folios", &len); + if (!nr_folios || len != sizeof(*nr_folios)) + return -EINVAL; + + old_csum = fdt_getprop(fdt, node, "csum", &len); + if (!old_csum || len != sizeof(*old_csum)) + return -EINVAL; + + folios_info = fdt_getprop(fdt, node, "folios_info", &len); + if (!folios_info || len != sizeof(*folios_info) * *nr_folios) + return -EINVAL; + + for (int i = 0; i < *nr_folios; i++) { + unsigned int order = folios_info[i] & ~PAGE_MASK; + phys_addr_t phys = folios_info[i] & PAGE_MASK; + unsigned int size = PAGE_SIZE << order; + struct folio *folio; + + folio = kho_restore_folio(phys); + if (!folio) + break; + + if (folio_order(folio) != order) + break; + + csum = csum_partial(folio_address(folio), size, csum); + folio_put(folio); + } + + if (csum != *old_csum) + return -EINVAL; + + return 0; +} + +static int kho_test_restore(phys_addr_t fdt_phys) +{ + void *fdt = phys_to_virt(fdt_phys); + const unsigned int *magic; + int node, len, err; + + node = fdt_path_offset(fdt, "/"); + if (node < 0) + return -EINVAL; + + if (fdt_node_check_compatible(fdt, node, KHO_TEST_COMPAT)) + return -EINVAL; + + magic = fdt_getprop(fdt, node, "magic", &len); + if (!magic || len != sizeof(*magic)) + return -EINVAL; + + if (*magic != KHO_TEST_MAGIC) + return -EINVAL; + + err = kho_test_restore_data(fdt, node); + if (err) + return err; + + pr_info("KHO restore succeeded\n"); + return 0; +} + +static int __init kho_test_init(void) +{ + phys_addr_t fdt_phys; + int err; + + err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); + if (!err) + return kho_test_restore(fdt_phys); + + if (err != -ENOENT) { + pr_warn("failed to retrieve %s FDT: %d\n", KHO_TEST_FDT, err); + return err; + } + + return kho_test_save(); +} +module_init(kho_test_init); + +static void kho_test_cleanup(void) +{ + for (int i = 0; i < kho_test_state.nr_folios; i++) + folio_put(kho_test_state.folios[i]); + + kvfree(kho_test_state.folios); +} + +static void __exit kho_test_exit(void) +{ + unregister_kho_notifier(&kho_test_nb); + kho_test_cleanup(); +} +module_exit(kho_test_exit); + +MODULE_AUTHOR("Mike Rapoport <rppt@kernel.org>"); +MODULE_DESCRIPTION("KHO test module"); +MODULE_LICENSE("GPL"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3d85800757aa..eb0cb11d0d12 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -60,6 
+60,20 @@ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); +/* + * Hashed pointers policy selected by "hash_pointers=..." boot param + * + * `auto` - Hashed pointers enabled unless disabled by slub_debug_enabled=true + * `always` - Hashed pointers enabled unconditionally + * `never` - Hashed pointers disabled unconditionally + */ +enum hash_pointers_policy { + HASH_PTR_AUTO = 0, + HASH_PTR_ALWAYS, + HASH_PTR_NEVER +}; +static enum hash_pointers_policy hash_pointers_mode __initdata; + noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { @@ -1699,10 +1713,9 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, return buf; } -#pragma GCC diagnostic push -#ifndef __clang__ -#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" -#endif +__diag_push(); +__diag_ignore(GCC, all, "-Wsuggest-attribute=format", + "Not a valid __printf() conversion candidate."); static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec) { @@ -1717,7 +1730,7 @@ static char *va_format(char *buf, char *end, struct va_format *va_fmt, return buf; } -#pragma GCC diagnostic pop +__diag_pop(); static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, @@ -2289,12 +2302,23 @@ char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, return resource_string(buf, end, ptr, spec, fmt); } -int __init no_hash_pointers_enable(char *str) +void __init hash_pointers_finalize(bool slub_debug) { - if (no_hash_pointers) - return 0; + switch (hash_pointers_mode) { + case HASH_PTR_ALWAYS: + no_hash_pointers = false; + break; + case HASH_PTR_NEVER: + no_hash_pointers = true; + break; + case HASH_PTR_AUTO: + default: + no_hash_pointers = slub_debug; + break; + } - no_hash_pointers = true; + if (!no_hash_pointers) + return; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); @@ -2307,11 +2331,39 @@ int __init no_hash_pointers_enable(char *str) pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! 
**\n"); pr_warn("** **\n"); + pr_warn("** Use hash_pointers=always to force this mode off **\n"); + pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); +} + +static int __init hash_pointers_mode_parse(char *str) +{ + if (!str) { + pr_warn("Hash pointers mode empty; falling back to auto.\n"); + hash_pointers_mode = HASH_PTR_AUTO; + } else if (strncmp(str, "auto", 4) == 0) { + pr_info("Hash pointers mode set to auto.\n"); + hash_pointers_mode = HASH_PTR_AUTO; + } else if (strncmp(str, "never", 5) == 0) { + pr_info("Hash pointers mode set to never.\n"); + hash_pointers_mode = HASH_PTR_NEVER; + } else if (strncmp(str, "always", 6) == 0) { + pr_info("Hash pointers mode set to always.\n"); + hash_pointers_mode = HASH_PTR_ALWAYS; + } else { + pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str); + hash_pointers_mode = HASH_PTR_AUTO; + } return 0; } +early_param("hash_pointers", hash_pointers_mode_parse); + +static int __init no_hash_pointers_enable(char *str) +{ + return hash_pointers_mode_parse("never"); +} early_param("no_hash_pointers", no_hash_pointers_enable); /* diff --git a/lib/xxhash.c b/lib/xxhash.c index b5bd567aa6b3..cf629766f376 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) } EXPORT_SYMBOL(xxh64_reset); -int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) -{ - const uint8_t *p = (const uint8_t *)input; - const uint8_t *const b_end = p + len; - - if (input == NULL) - return -EINVAL; - - state->total_len_32 += (uint32_t)len; - state->large_len |= (len >= 16) | (state->total_len_32 >= 16); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); - state->memsize += (uint32_t)len; - return 0; - } - - if (state->memsize) { /* some data left from previous update */ - const uint32_t *p32 = state->mem32; - - memcpy((uint8_t *)(state->mem32) + state->memsize, input, - 16 - state->memsize); - - state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); - p32++; - state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); - p32++; - state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); - p32++; - state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); - p32++; - - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= b_end - 16) { - const uint8_t *const limit = b_end - 16; - uint32_t v1 = state->v1; - uint32_t v2 = state->v2; - uint32_t v3 = state->v3; - uint32_t v4 = state->v4; - - do { - v1 = xxh32_round(v1, get_unaligned_le32(p)); - p += 4; - v2 = xxh32_round(v2, get_unaligned_le32(p)); - p += 4; - v3 = xxh32_round(v3, get_unaligned_le32(p)); - p += 4; - v4 = xxh32_round(v4, get_unaligned_le32(p)); - p += 4; - } while (p <= limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < b_end) { - memcpy(state->mem32, p, (size_t)(b_end-p)); - state->memsize = (uint32_t)(b_end-p); - } - - return 0; -} -EXPORT_SYMBOL(xxh32_update); - -uint32_t xxh32_digest(const struct xxh32_state *state) -{ - const uint8_t *p = (const uint8_t *)state->mem32; - const uint8_t *const b_end = (const uint8_t *)(state->mem32) + - state->memsize; - uint32_t h32; - - if (state->large_len) { - h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + - xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == 
seed */ + PRIME32_5; - } - - h32 += state->total_len_32; - - while (p + 4 <= b_end) { - h32 += get_unaligned_le32(p) * PRIME32_3; - h32 = xxh_rotl32(h32, 17) * PRIME32_4; - p += 4; - } - - while (p < b_end) { - h32 += (*p) * PRIME32_5; - h32 = xxh_rotl32(h32, 11) * PRIME32_1; - p++; - } - - h32 ^= h32 >> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} -EXPORT_SYMBOL(xxh32_digest); - int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; diff --git a/mm/slub.c b/mm/slub.c index cf7c6032d5fd..30003763d224 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6312,9 +6312,8 @@ void __init kmem_cache_init(void) if (debug_guardpage_minorder()) slub_max_order = 0; - /* Print slub debugging pointers without hashing */ - if (__slub_debug_enabled()) - no_hash_pointers_enable(NULL); + /* Inform pointer hashing choice about slub debugging state. */ + hash_pointers_finalize(__slub_debug_enabled()); kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; diff --git a/rust/Makefile b/rust/Makefile index 115b63b7d1e3..4263462b8470 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -34,6 +34,9 @@ obj-$(CONFIG_RUST_KERNEL_DOCTESTS) += doctests_kernel_generated.o obj-$(CONFIG_RUST_KERNEL_DOCTESTS) += doctests_kernel_generated_kunit.o always-$(subst y,$(CONFIG_RUST),$(CONFIG_JUMP_LABEL)) += kernel/generated_arch_static_branch_asm.rs +ifndef CONFIG_UML +always-$(subst y,$(CONFIG_RUST),$(CONFIG_BUG)) += kernel/generated_arch_warn_asm.rs kernel/generated_arch_reachable_asm.rs +endif # Avoids running `$(RUSTC)` when it may not be available. ifdef CONFIG_RUST @@ -541,5 +544,10 @@ $(obj)/kernel.o: $(src)/kernel/lib.rs $(obj)/build_error.o $(obj)/pin_init.o \ ifdef CONFIG_JUMP_LABEL $(obj)/kernel.o: $(obj)/kernel/generated_arch_static_branch_asm.rs endif +ifndef CONFIG_UML +ifdef CONFIG_BUG +$(obj)/kernel.o: $(obj)/kernel/generated_arch_warn_asm.rs $(obj)/kernel/generated_arch_reachable_asm.rs +endif +endif endif # CONFIG_RUST diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index a08eb5518cac..474cc98c48a3 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -25,6 +25,9 @@ )] #[allow(dead_code)] +#[allow(clippy::cast_lossless)] +#[allow(clippy::ptr_as_ptr)] +#[allow(clippy::ref_as_ptr)] #[allow(clippy::undocumented_unsafe_blocks)] #[cfg_attr(CONFIG_RUSTC_HAS_UNNECESSARY_TRANSMUTES, allow(unnecessary_transmutes))] mod bindings_raw { diff --git a/rust/helpers/bug.c b/rust/helpers/bug.c index e2d13babc737..a62c96f507d1 100644 --- a/rust/helpers/bug.c +++ b/rust/helpers/bug.c @@ -6,3 +6,8 @@ __noreturn void rust_helper_BUG(void) { BUG(); } + +bool rust_helper_WARN_ON(bool cond) +{ + return WARN_ON(cond); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 2bb13285825b..7cf7fe95e41d 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -30,21 +30,22 @@ #include "mutex.c" #include "of.c" #include "page.c" -#include "platform.c" #include "pci.c" #include "pid_namespace.c" +#include "platform.c" #include "poll.c" #include "property.c" #include "rbtree.c" -#include "regulator.c" #include "rcu.c" #include "refcount.c" +#include "regulator.c" #include "security.c" #include "signal.c" #include "slab.c" #include "spinlock.c" #include "sync.c" #include "task.c" +#include "time.c" #include "uaccess.c" #include "vmalloc.c" #include "wait.c" diff --git a/rust/helpers/time.c b/rust/helpers/time.c new file mode 100644 index 000000000000..a318e9fa4408 --- 
/dev/null +++ b/rust/helpers/time.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/timekeeping.h> + +void rust_helper_fsleep(unsigned long usecs) +{ + fsleep(usecs); +} + +ktime_t rust_helper_ktime_get_real(void) +{ + return ktime_get_real(); +} + +ktime_t rust_helper_ktime_get_boottime(void) +{ + return ktime_get_boottime(); +} + +ktime_t rust_helper_ktime_get_clocktai(void) +{ + return ktime_get_clocktai(); +} + +s64 rust_helper_ktime_to_us(const ktime_t kt) +{ + return ktime_to_us(kt); +} + +s64 rust_helper_ktime_to_ms(const ktime_t kt) +{ + return ktime_to_ms(kt); +} diff --git a/rust/kernel/.gitignore b/rust/kernel/.gitignore index 6ba39a178f30..f636ad95aaf3 100644 --- a/rust/kernel/.gitignore +++ b/rust/kernel/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 /generated_arch_static_branch_asm.rs +/generated_arch_warn_asm.rs +/generated_arch_reachable_asm.rs diff --git a/rust/kernel/alloc/allocator_test.rs b/rust/kernel/alloc/allocator_test.rs index d19c06ef0498..a3074480bd8d 100644 --- a/rust/kernel/alloc/allocator_test.rs +++ b/rust/kernel/alloc/allocator_test.rs @@ -82,7 +82,7 @@ unsafe impl Allocator for Cmalloc { // SAFETY: Returns either NULL or a pointer to a memory allocation that satisfies or // exceeds the given size and alignment requirements. - let dst = unsafe { libc_aligned_alloc(layout.align(), layout.size()) } as *mut u8; + let dst = unsafe { libc_aligned_alloc(layout.align(), layout.size()) }.cast::<u8>(); let dst = NonNull::new(dst).ok_or(AllocError)?; if flags.contains(__GFP_ZERO) { diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs index c386ff771d50..856d05aa60f1 100644 --- a/rust/kernel/alloc/kbox.rs +++ b/rust/kernel/alloc/kbox.rs @@ -6,6 +6,7 @@ use super::allocator::{KVmalloc, Kmalloc, Vmalloc}; use super::{AllocError, Allocator, Flags}; use core::alloc::Layout; +use core::borrow::{Borrow, BorrowMut}; use core::fmt; use core::marker::PhantomData; use core::mem::ManuallyDrop; @@ -15,6 +16,7 @@ use core::pin::Pin; use core::ptr::NonNull; use core::result::Result; +use crate::ffi::c_void; use crate::init::InPlaceInit; use crate::types::ForeignOwnable; use pin_init::{InPlaceWrite, Init, PinInit, ZeroableOption}; @@ -398,70 +400,74 @@ where } } -// SAFETY: The `into_foreign` function returns a pointer that is well-aligned. +// SAFETY: The pointer returned by `into_foreign` comes from a well aligned +// pointer to `T`. unsafe impl<T: 'static, A> ForeignOwnable for Box<T, A> where A: Allocator, { - type PointedTo = T; + const FOREIGN_ALIGN: usize = core::mem::align_of::<T>(); type Borrowed<'a> = &'a T; type BorrowedMut<'a> = &'a mut T; - fn into_foreign(self) -> *mut Self::PointedTo { - Box::into_raw(self) + fn into_foreign(self) -> *mut c_void { + Box::into_raw(self).cast() } - unsafe fn from_foreign(ptr: *mut Self::PointedTo) -> Self { + unsafe fn from_foreign(ptr: *mut c_void) -> Self { // SAFETY: The safety requirements of this function ensure that `ptr` comes from a previous // call to `Self::into_foreign`. - unsafe { Box::from_raw(ptr) } + unsafe { Box::from_raw(ptr.cast()) } } - unsafe fn borrow<'a>(ptr: *mut Self::PointedTo) -> &'a T { + unsafe fn borrow<'a>(ptr: *mut c_void) -> &'a T { // SAFETY: The safety requirements of this method ensure that the object remains alive and // immutable for the duration of 'a. 
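The rust/helpers/time.c additions above exist because fsleep() and these ktime accessors are 'static inline' in their C headers and therefore emit no symbols a Rust crate can link against; each rust_helper_*() wrapper materializes exactly one. A hypothetical extra wrapper, not part of this patch, would take the same shape::

    #include <linux/ktime.h>

    s64 rust_helper_ktime_to_ns(const ktime_t kt)
    {
            return ktime_to_ns(kt);
    }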
- unsafe { &*ptr } + unsafe { &*ptr.cast() } } - unsafe fn borrow_mut<'a>(ptr: *mut Self::PointedTo) -> &'a mut T { + unsafe fn borrow_mut<'a>(ptr: *mut c_void) -> &'a mut T { + let ptr = ptr.cast(); // SAFETY: The safety requirements of this method ensure that the pointer is valid and that // nothing else will access the value for the duration of 'a. unsafe { &mut *ptr } } } -// SAFETY: The `into_foreign` function returns a pointer that is well-aligned. +// SAFETY: The pointer returned by `into_foreign` comes from a well aligned +// pointer to `T`. unsafe impl<T: 'static, A> ForeignOwnable for Pin<Box<T, A>> where A: Allocator, { - type PointedTo = T; + const FOREIGN_ALIGN: usize = core::mem::align_of::<T>(); type Borrowed<'a> = Pin<&'a T>; type BorrowedMut<'a> = Pin<&'a mut T>; - fn into_foreign(self) -> *mut Self::PointedTo { + fn into_foreign(self) -> *mut c_void { // SAFETY: We are still treating the box as pinned. - Box::into_raw(unsafe { Pin::into_inner_unchecked(self) }) + Box::into_raw(unsafe { Pin::into_inner_unchecked(self) }).cast() } - unsafe fn from_foreign(ptr: *mut Self::PointedTo) -> Self { + unsafe fn from_foreign(ptr: *mut c_void) -> Self { // SAFETY: The safety requirements of this function ensure that `ptr` comes from a previous // call to `Self::into_foreign`. - unsafe { Pin::new_unchecked(Box::from_raw(ptr)) } + unsafe { Pin::new_unchecked(Box::from_raw(ptr.cast())) } } - unsafe fn borrow<'a>(ptr: *mut Self::PointedTo) -> Pin<&'a T> { + unsafe fn borrow<'a>(ptr: *mut c_void) -> Pin<&'a T> { // SAFETY: The safety requirements for this function ensure that the object is still alive, // so it is safe to dereference the raw pointer. // The safety requirements of `from_foreign` also ensure that the object remains alive for // the lifetime of the returned value. - let r = unsafe { &*ptr }; + let r = unsafe { &*ptr.cast() }; // SAFETY: This pointer originates from a `Pin<Box<T>>`. unsafe { Pin::new_unchecked(r) } } - unsafe fn borrow_mut<'a>(ptr: *mut Self::PointedTo) -> Pin<&'a mut T> { + unsafe fn borrow_mut<'a>(ptr: *mut c_void) -> Pin<&'a mut T> { + let ptr = ptr.cast(); // SAFETY: The safety requirements for this function ensure that the object is still alive, // so it is safe to dereference the raw pointer. // The safety requirements of `from_foreign` also ensure that the object remains alive for @@ -499,6 +505,62 @@ where } } +/// # Examples +/// +/// ``` +/// # use core::borrow::Borrow; +/// # use kernel::alloc::KBox; +/// struct Foo<B: Borrow<u32>>(B); +/// +/// // Owned instance. +/// let owned = Foo(1); +/// +/// // Owned instance using `KBox`. +/// let owned_kbox = Foo(KBox::new(1, GFP_KERNEL)?); +/// +/// let i = 1; +/// // Borrowed from `i`. +/// let borrowed = Foo(&i); +/// # Ok::<(), Error>(()) +/// ``` +impl<T, A> Borrow<T> for Box<T, A> +where + T: ?Sized, + A: Allocator, +{ + fn borrow(&self) -> &T { + self.deref() + } +} + +/// # Examples +/// +/// ``` +/// # use core::borrow::BorrowMut; +/// # use kernel::alloc::KBox; +/// struct Foo<B: BorrowMut<u32>>(B); +/// +/// // Owned instance. +/// let owned = Foo(1); +/// +/// // Owned instance using `KBox`. +/// let owned_kbox = Foo(KBox::new(1, GFP_KERNEL)?); +/// +/// let mut i = 1; +/// // Borrowed from `i`. 
+/// let borrowed = Foo(&mut i); +/// # Ok::<(), Error>(()) +/// ``` +impl<T, A> BorrowMut<T> for Box<T, A> +where + T: ?Sized, + A: Allocator, +{ + fn borrow_mut(&mut self) -> &mut T { + self.deref_mut() + } +} + impl<T, A> fmt::Display for Box<T, A> where T: ?Sized + fmt::Display, diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index 1a0dd852a468..3c72e0bdddb8 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -8,6 +8,7 @@ use super::{ AllocError, Allocator, Box, Flags, }; use core::{ + borrow::{Borrow, BorrowMut}, fmt, marker::PhantomData, mem::{ManuallyDrop, MaybeUninit}, @@ -288,7 +289,7 @@ where // - `self.len` is smaller than `self.capacity` by the type invariant and hence, the // resulting pointer is guaranteed to be part of the same allocated object. // - `self.len` can not overflow `isize`. - let ptr = unsafe { self.as_mut_ptr().add(self.len) } as *mut MaybeUninit<T>; + let ptr = unsafe { self.as_mut_ptr().add(self.len) }.cast::<MaybeUninit<T>>(); // SAFETY: The memory between `self.len` and `self.capacity` is guaranteed to be allocated // and valid, but uninitialized. @@ -847,11 +848,11 @@ where // - `ptr` points to memory with at least a size of `size_of::<T>() * len`, // - all elements within `b` are initialized values of `T`, // - `len` does not exceed `isize::MAX`. - unsafe { Vec::from_raw_parts(ptr as _, len, len) } + unsafe { Vec::from_raw_parts(ptr.cast(), len, len) } } } -impl<T> Default for KVec<T> { +impl<T, A: Allocator> Default for Vec<T, A> { #[inline] fn default() -> Self { Self::new() @@ -890,6 +891,58 @@ where } } +/// # Examples +/// +/// ``` +/// # use core::borrow::Borrow; +/// struct Foo<B: Borrow<[u32]>>(B); +/// +/// // Owned array. +/// let owned_array = Foo([1, 2, 3]); +/// +/// // Owned vector. +/// let owned_vec = Foo(KVec::from_elem(0, 3, GFP_KERNEL)?); +/// +/// let arr = [1, 2, 3]; +/// // Borrowed slice from `arr`. +/// let borrowed_slice = Foo(&arr[..]); +/// # Ok::<(), Error>(()) +/// ``` +impl<T, A> Borrow<[T]> for Vec<T, A> +where + A: Allocator, +{ + fn borrow(&self) -> &[T] { + self.as_slice() + } +} + +/// # Examples +/// +/// ``` +/// # use core::borrow::BorrowMut; +/// struct Foo<B: BorrowMut<[u32]>>(B); +/// +/// // Owned array. +/// let owned_array = Foo([1, 2, 3]); +/// +/// // Owned vector. +/// let owned_vec = Foo(KVec::from_elem(0, 3, GFP_KERNEL)?); +/// +/// let mut arr = [1, 2, 3]; +/// // Borrowed slice from `arr`. +/// let borrowed_slice = Foo(&mut arr[..]); +/// # Ok::<(), Error>(()) +/// ``` +impl<T, A> BorrowMut<[T]> for Vec<T, A> +where + A: Allocator, +{ + fn borrow_mut(&mut self) -> &mut [T] { + self.as_mut_slice() + } +} + impl<T: Eq, A> Eq for Vec<T, A> where A: Allocator {} impl<T, I: SliceIndex<[T]>, A> Index<I> for Vec<T, A> diff --git a/rust/kernel/bits.rs b/rust/kernel/bits.rs new file mode 100644 index 000000000000..553d50265883 --- /dev/null +++ b/rust/kernel/bits.rs @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Bit manipulation macros. +//! +//! C header: [`include/linux/bits.h`](srctree/include/linux/bits.h) + +use crate::prelude::*; +use core::ops::RangeInclusive; +use macros::paste; + +macro_rules! impl_bit_fn { + ( + $ty:ty + ) => { + paste! { + /// Computes `1 << n` if `n` is in bounds, i.e.: if `n` is smaller than + /// the maximum number of bits supported by the type. + /// + /// Returns [`None`] otherwise. 
+ #[inline] + pub fn [<checked_bit_ $ty>](n: u32) -> Option<$ty> { + (1 as $ty).checked_shl(n) + } + + /// Computes `1 << n` by performing a compile-time assertion that `n` is + /// in bounds. + /// + /// This version is the default and should be used if `n` is known at + /// compile time. + #[inline] + pub const fn [<bit_ $ty>](n: u32) -> $ty { + build_assert!(n < <$ty>::BITS); + (1 as $ty) << n + } + } + }; +} + +impl_bit_fn!(u64); +impl_bit_fn!(u32); +impl_bit_fn!(u16); +impl_bit_fn!(u8); + +macro_rules! impl_genmask_fn { + ( + $ty:ty, + $(#[$genmask_checked_ex:meta])*, + $(#[$genmask_ex:meta])* + ) => { + paste! { + /// Creates a contiguous bitmask for the given range by validating + /// the range at runtime. + /// + /// Returns [`None`] if the range is invalid, i.e.: if the start is + /// greater than the end or if the range is outside of the + /// representable range for the type. + $(#[$genmask_checked_ex])* + #[inline] + pub fn [<genmask_checked_ $ty>](range: RangeInclusive<u32>) -> Option<$ty> { + let start = *range.start(); + let end = *range.end(); + + if start > end { + return None; + } + + let high = [<checked_bit_ $ty>](end)?; + let low = [<checked_bit_ $ty>](start)?; + Some((high | (high - 1)) & !(low - 1)) + } + + /// Creates a compile-time contiguous bitmask for the given range by + /// performing a compile-time assertion that the range is valid. + /// + /// This version is the default and should be used if the range is known + /// at compile time. + $(#[$genmask_ex])* + #[inline] + pub const fn [<genmask_ $ty>](range: RangeInclusive<u32>) -> $ty { + let start = *range.start(); + let end = *range.end(); + + build_assert!(start <= end); + + let high = [<bit_ $ty>](end); + let low = [<bit_ $ty>](start); + (high | (high - 1)) & !(low - 1) + } + } + }; +} + +impl_genmask_fn!( + u64, + /// # Examples + /// + /// ``` + /// # #![expect(clippy::reversed_empty_ranges)] + /// # use kernel::bits::genmask_checked_u64; + /// assert_eq!(genmask_checked_u64(0..=0), Some(0b1)); + /// assert_eq!(genmask_checked_u64(0..=63), Some(u64::MAX)); + /// assert_eq!(genmask_checked_u64(21..=39), Some(0x0000_00ff_ffe0_0000)); + /// + /// // `80` is out of the supported bit range. + /// assert_eq!(genmask_checked_u64(21..=80), None); + /// + /// // Invalid range where the start is bigger than the end. + /// assert_eq!(genmask_checked_u64(15..=8), None); + /// ``` + , + /// # Examples + /// + /// ``` + /// # use kernel::bits::genmask_u64; + /// assert_eq!(genmask_u64(21..=39), 0x0000_00ff_ffe0_0000); + /// assert_eq!(genmask_u64(0..=0), 0b1); + /// assert_eq!(genmask_u64(0..=63), u64::MAX); + /// ``` +); + +impl_genmask_fn!( + u32, + /// # Examples + /// + /// ``` + /// # #![expect(clippy::reversed_empty_ranges)] + /// # use kernel::bits::genmask_checked_u32; + /// assert_eq!(genmask_checked_u32(0..=0), Some(0b1)); + /// assert_eq!(genmask_checked_u32(0..=31), Some(u32::MAX)); + /// assert_eq!(genmask_checked_u32(21..=31), Some(0xffe0_0000)); + /// + /// // `40` is out of the supported bit range. + /// assert_eq!(genmask_checked_u32(21..=40), None); + /// + /// // Invalid range where the start is bigger than the end. 
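These genmask_* helpers are the Rust face of GENMASK() from include/linux/bits.h, and the doc-tests pin the same values the C macro yields. The core identity shared by the checked and const variants, written out in C (the demo name is illustrative)::

    #include <linux/types.h>

    /* set bits lo..hi inclusive, clear everything else */
    static inline u64 demo_genmask_u64(unsigned int lo, unsigned int hi)
    {
            u64 high = 1ULL << hi;
            u64 low = 1ULL << lo;

            return (high | (high - 1)) & ~(low - 1);
    }

    /* demo_genmask_u64(21, 39) == 0x000000ffffe00000, matching genmask_u64(21..=39) */

high | (high - 1) fills bits 0 through hi, and masking with ~(low - 1) strips everything below lo, which is exactly the (high | (high - 1)) & !(low - 1) expression in the Rust bodies.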
+ /// assert_eq!(genmask_checked_u32(15..=8), None); + /// ``` + , + /// # Examples + /// + /// ``` + /// # use kernel::bits::genmask_u32; + /// assert_eq!(genmask_u32(21..=31), 0xffe0_0000); + /// assert_eq!(genmask_u32(0..=0), 0b1); + /// assert_eq!(genmask_u32(0..=31), u32::MAX); + /// ``` +); + +impl_genmask_fn!( + u16, + /// # Examples + /// + /// ``` + /// # #![expect(clippy::reversed_empty_ranges)] + /// # use kernel::bits::genmask_checked_u16; + /// assert_eq!(genmask_checked_u16(0..=0), Some(0b1)); + /// assert_eq!(genmask_checked_u16(0..=15), Some(u16::MAX)); + /// assert_eq!(genmask_checked_u16(6..=15), Some(0xffc0)); + /// + /// // `20` is out of the supported bit range. + /// assert_eq!(genmask_checked_u16(6..=20), None); + /// + /// // Invalid range where the start is bigger than the end. + /// assert_eq!(genmask_checked_u16(10..=5), None); + /// ``` + , + /// # Examples + /// + /// ``` + /// # use kernel::bits::genmask_u16; + /// assert_eq!(genmask_u16(6..=15), 0xffc0); + /// assert_eq!(genmask_u16(0..=0), 0b1); + /// assert_eq!(genmask_u16(0..=15), u16::MAX); + /// ``` +); + +impl_genmask_fn!( + u8, + /// # Examples + /// + /// ``` + /// # #![expect(clippy::reversed_empty_ranges)] + /// # use kernel::bits::genmask_checked_u8; + /// assert_eq!(genmask_checked_u8(0..=0), Some(0b1)); + /// assert_eq!(genmask_checked_u8(0..=7), Some(u8::MAX)); + /// assert_eq!(genmask_checked_u8(6..=7), Some(0xc0)); + /// + /// // `10` is out of the supported bit range. + /// assert_eq!(genmask_checked_u8(6..=10), None); + /// + /// // Invalid range where the start is bigger than the end. + /// assert_eq!(genmask_checked_u8(5..=2), None); + /// ``` + , + /// # Examples + /// + /// ``` + /// # use kernel::bits::genmask_u8; + /// assert_eq!(genmask_u8(6..=7), 0xc0); + /// assert_eq!(genmask_u8(0..=0), 0b1); + /// assert_eq!(genmask_u8(0..=7), u8::MAX); + /// ``` +); diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index fb0f393c1cea..831445d37181 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -53,7 +53,7 @@ //! [`GenDiskBuilder`]: gen_disk::GenDiskBuilder //! [`GenDiskBuilder::build`]: gen_disk::GenDiskBuilder::build //! -//! # Example +//! # Examples //! //! ```rust //! use kernel::{ diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs index 864ff379dc91..c2b98f507bcb 100644 --- a/rust/kernel/block/mq/operations.rs +++ b/rust/kernel/block/mq/operations.rs @@ -101,7 +101,7 @@ impl<T: Operations> OperationsVTable<T> { if let Err(e) = ret { e.to_blk_status() } else { - bindings::BLK_STS_OK as _ + bindings::BLK_STS_OK as bindings::blk_status_t } } diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index 4a5b7ec914ef..fefd394f064a 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -69,7 +69,7 @@ impl<T: Operations> Request<T> { // INVARIANT: By the safety requirements of this function, invariants are upheld. // SAFETY: By the safety requirement of this function, we own a // reference count that we can pass to `ARef`. - unsafe { ARef::from_raw(NonNull::new_unchecked(ptr as *const Self as *mut Self)) } + unsafe { ARef::from_raw(NonNull::new_unchecked(ptr.cast())) } } /// Notify the block layer that a request is going to be processed now. @@ -125,7 +125,12 @@ impl<T: Operations> Request<T> { // success of the call to `try_set_end` guarantees that there are no // `ARef`s pointing to this request. Therefore it is safe to hand it // back to the block layer. 
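+ // `blk_mq_end_request` completes the bios attached to the request and
+ // returns the request's tag to the block layer for reuse.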
- unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) }; + unsafe { + bindings::blk_mq_end_request( + request_ptr, + bindings::BLK_STS_OK as bindings::blk_status_t, + ) + }; Ok(()) } @@ -155,7 +160,7 @@ impl<T: Operations> Request<T> { // the private data associated with this request is initialized and // valid. The existence of `&self` guarantees that the private data is // valid as a shared reference. - unsafe { Self::wrapper_ptr(self as *const Self as *mut Self).as_ref() } + unsafe { Self::wrapper_ptr(core::ptr::from_ref(self).cast_mut()).as_ref() } } } diff --git a/rust/kernel/bug.rs b/rust/kernel/bug.rs new file mode 100644 index 000000000000..36aef43e5ebe --- /dev/null +++ b/rust/kernel/bug.rs @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024, 2025 FUJITA Tomonori <fujita.tomonori@gmail.com> + +//! Support for BUG and WARN functionality. +//! +//! C header: [`include/asm-generic/bug.h`](srctree/include/asm-generic/bug.h) + +#[macro_export] +#[doc(hidden)] +#[cfg(all(CONFIG_BUG, not(CONFIG_UML), not(CONFIG_LOONGARCH), not(CONFIG_ARM)))] +#[cfg(CONFIG_DEBUG_BUGVERBOSE)] +macro_rules! warn_flags { + ($flags:expr) => { + const FLAGS: u32 = $crate::bindings::BUGFLAG_WARNING | $flags; + const _FILE: &[u8] = file!().as_bytes(); + // Plus one for null-terminator. + static FILE: [u8; _FILE.len() + 1] = { + let mut bytes = [0; _FILE.len() + 1]; + let mut i = 0; + while i < _FILE.len() { + bytes[i] = _FILE[i]; + i += 1; + } + bytes + }; + + // SAFETY: + // - `file`, `line`, `flags`, and `size` are all compile-time constants or + // symbols, preventing any invalid memory access. + // - The asm block has no side effects and does not modify any registers + // or memory. It is purely for embedding metadata into the ELF section. + unsafe { + $crate::asm!( + concat!( + "/* {size} */", + include!(concat!(env!("OBJTREE"), "/rust/kernel/generated_arch_warn_asm.rs")), + include!(concat!(env!("OBJTREE"), "/rust/kernel/generated_arch_reachable_asm.rs"))); + file = sym FILE, + line = const line!(), + flags = const FLAGS, + size = const ::core::mem::size_of::<$crate::bindings::bug_entry>(), + ); + } + } +} + +#[macro_export] +#[doc(hidden)] +#[cfg(all(CONFIG_BUG, not(CONFIG_UML), not(CONFIG_LOONGARCH), not(CONFIG_ARM)))] +#[cfg(not(CONFIG_DEBUG_BUGVERBOSE))] +macro_rules! warn_flags { + ($flags:expr) => { + const FLAGS: u32 = $crate::bindings::BUGFLAG_WARNING | $flags; + + // SAFETY: + // - `flags` and `size` are all compile-time constants, preventing + // any invalid memory access. + // - The asm block has no side effects and does not modify any registers + // or memory. It is purely for embedding metadata into the ELF section. + unsafe { + $crate::asm!( + concat!( + "/* {size} */", + include!(concat!(env!("OBJTREE"), "/rust/kernel/generated_arch_warn_asm.rs")), + include!(concat!(env!("OBJTREE"), "/rust/kernel/generated_arch_reachable_asm.rs"))); + flags = const FLAGS, + size = const ::core::mem::size_of::<$crate::bindings::bug_entry>(), + ); + } + } +} + +#[macro_export] +#[doc(hidden)] +#[cfg(all(CONFIG_BUG, CONFIG_UML))] +macro_rules! warn_flags { + ($flags:expr) => { + // SAFETY: It is always safe to call `warn_slowpath_fmt()` + // with a valid null-terminated string. 
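+ // `c_str!(::core::file!())` below yields such a string, and the C
+ // implementation treats a NULL format pointer as "no additional message".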
+ unsafe { + $crate::bindings::warn_slowpath_fmt( + $crate::c_str!(::core::file!()).as_char_ptr(), + line!() as $crate::ffi::c_int, + $flags as $crate::ffi::c_uint, + ::core::ptr::null(), + ); + } + }; +} + +#[macro_export] +#[doc(hidden)] +#[cfg(all(CONFIG_BUG, any(CONFIG_LOONGARCH, CONFIG_ARM)))] +macro_rules! warn_flags { + ($flags:expr) => { + // SAFETY: It is always safe to call `WARN_ON()`. + unsafe { $crate::bindings::WARN_ON(true) } + }; +} + +#[macro_export] +#[doc(hidden)] +#[cfg(not(CONFIG_BUG))] +macro_rules! warn_flags { + ($flags:expr) => {}; +} + +#[doc(hidden)] +pub const fn bugflag_taint(value: u32) -> u32 { + value << 8 +} + +/// Report a warning if `cond` is true and return the condition's evaluation result. +#[macro_export] +macro_rules! warn_on { + ($cond:expr) => {{ + let cond = $cond; + if cond { + const WARN_ON_FLAGS: u32 = $crate::bug::bugflag_taint($crate::bindings::TAINT_WARN); + + $crate::warn_flags!(WARN_ON_FLAGS); + } + cond + }}; +} diff --git a/rust/kernel/clk.rs b/rust/kernel/clk.rs index fbcea31dbcca..1e6c8c42fb3a 100644 --- a/rust/kernel/clk.rs +++ b/rust/kernel/clk.rs @@ -12,7 +12,7 @@ use crate::ffi::c_ulong; /// /// Represents a frequency in hertz, wrapping a [`c_ulong`] value. /// -/// ## Examples +/// # Examples /// /// ``` /// use kernel::clk::Hertz; @@ -99,7 +99,7 @@ mod common_clk { /// Instances of this type are reference-counted. Calling [`Clk::get`] ensures that the /// allocation remains valid for the lifetime of the [`Clk`]. /// - /// ## Examples + /// # Examples /// /// The following example demonstrates how to obtain and configure a clock for a device. /// @@ -266,7 +266,7 @@ mod common_clk { /// Instances of this type are reference-counted. Calling [`OptionalClk::get`] ensures that the /// allocation remains valid for the lifetime of the [`OptionalClk`]. /// - /// ## Examples + /// # Examples /// /// The following example demonstrates how to obtain and configure an optional clock for a /// device. The code functions correctly whether or not the clock is available. diff --git a/rust/kernel/configfs.rs b/rust/kernel/configfs.rs index 34d0bea4f9a5..2736b798cdc6 100644 --- a/rust/kernel/configfs.rs +++ b/rust/kernel/configfs.rs @@ -17,7 +17,7 @@ //! //! C header: [`include/linux/configfs.h`](srctree/include/linux/configfs.h) //! -//! # Example +//! # Examples //! //! ```ignore //! use kernel::alloc::flags; @@ -151,7 +151,7 @@ impl<Data> Subsystem<Data> { data: impl PinInit<Data, Error>, ) -> impl PinInit<Self, Error> { try_pin_init!(Self { - subsystem <- pin_init::zeroed().chain( + subsystem <- pin_init::init_zeroed().chain( |place: &mut Opaque<bindings::configfs_subsystem>| { // SAFETY: We initialized the required fields of `place.group` above. unsafe { @@ -261,7 +261,7 @@ impl<Data> Group<Data> { data: impl PinInit<Data, Error>, ) -> impl PinInit<Self, Error> { try_pin_init!(Self { - group <- pin_init::zeroed().chain(|v: &mut Opaque<bindings::config_group>| { + group <- pin_init::init_zeroed().chain(|v: &mut Opaque<bindings::config_group>| { let place = v.get(); let name = name.as_bytes_with_nul().as_ptr(); // SAFETY: It is safe to initialize a group once it has been zeroed. @@ -279,7 +279,7 @@ impl<Data> Group<Data> { // within the `group` field. unsafe impl<Data> HasGroup<Data> for Group<Data> { unsafe fn group(this: *const Self) -> *const bindings::config_group { - Opaque::raw_get( + Opaque::cast_into( // SAFETY: By impl and function safety requirements this field // projection is within bounds of the allocation. 
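+ // `Opaque::cast_into` only changes the pointer type from `*const Opaque<T>`
+ // to `*const T`; the address itself is unchanged.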
unsafe { &raw const (*this).group }, @@ -426,7 +426,7 @@ where }; const fn vtable_ptr() -> *const bindings::configfs_group_operations { - &Self::VTABLE as *const bindings::configfs_group_operations + &Self::VTABLE } } @@ -464,7 +464,7 @@ where }; const fn vtable_ptr() -> *const bindings::configfs_item_operations { - &Self::VTABLE as *const bindings::configfs_item_operations + &Self::VTABLE } } @@ -476,7 +476,7 @@ impl<Data> ItemOperationsVTable<Subsystem<Data>, Data> { }; const fn vtable_ptr() -> *const bindings::configfs_item_operations { - &Self::VTABLE as *const bindings::configfs_item_operations + &Self::VTABLE } } @@ -561,7 +561,7 @@ where let data: &Data = unsafe { get_group_data(c_group) }; // SAFETY: By function safety requirements, `page` is writable for `PAGE_SIZE`. - let ret = O::show(data, unsafe { &mut *(page as *mut [u8; PAGE_SIZE]) }); + let ret = O::show(data, unsafe { &mut *(page.cast::<[u8; PAGE_SIZE]>()) }); match ret { Ok(size) => size as isize, @@ -717,11 +717,7 @@ impl<const N: usize, Data> AttributeList<N, Data> { // SAFETY: By function safety requirements, we have exclusive access to // `self` and the reference created below will be exclusive. - unsafe { - (&mut *self.0.get())[I] = (attribute as *const Attribute<ID, O, Data>) - .cast_mut() - .cast() - }; + unsafe { (&mut *self.0.get())[I] = core::ptr::from_ref(attribute).cast_mut().cast() }; } } @@ -761,9 +757,7 @@ macro_rules! impl_item_type { ct_owner: owner.as_ptr(), ct_group_ops: GroupOperationsVTable::<Data, Child>::vtable_ptr().cast_mut(), ct_item_ops: ItemOperationsVTable::<$tpe, Data>::vtable_ptr().cast_mut(), - ct_attrs: (attributes as *const AttributeList<N, Data>) - .cast_mut() - .cast(), + ct_attrs: core::ptr::from_ref(attributes).cast_mut().cast(), ct_bin_attrs: core::ptr::null_mut(), }), _p: PhantomData, @@ -780,9 +774,7 @@ macro_rules! impl_item_type { ct_owner: owner.as_ptr(), ct_group_ops: core::ptr::null_mut(), ct_item_ops: ItemOperationsVTable::<$tpe, Data>::vtable_ptr().cast_mut(), - ct_attrs: (attributes as *const AttributeList<N, Data>) - .cast_mut() - .cast(), + ct_attrs: core::ptr::from_ref(attributes).cast_mut().cast(), ct_bin_attrs: core::ptr::null_mut(), }), _p: PhantomData, diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index d0ea24236ae4..afc15e72a7c3 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -202,7 +202,7 @@ impl From<TableIndex> for usize { /// The callers must ensure that the `struct cpufreq_frequency_table` is valid for access and /// remains valid for the lifetime of the returned reference. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to read a frequency value from [`Table`]. /// @@ -318,7 +318,7 @@ impl Deref for TableBox { /// /// This is used by the CPU frequency drivers to build a frequency table dynamically. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to create a CPU frequency table. /// @@ -395,7 +395,7 @@ impl TableBuilder { /// The callers must ensure that the `struct cpufreq_policy` is valid for access and remains valid /// for the lifetime of the returned reference. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to create a CPU frequency table. /// @@ -649,7 +649,7 @@ impl Policy { fn set_data<T: ForeignOwnable>(&mut self, data: T) -> Result { if self.as_ref().driver_data.is_null() { // Transfer the ownership of the data to the foreign interface. 
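+ // `into_foreign` hands the value over as a raw pointer without dropping
+ // it; ownership is reclaimed later via `ForeignOwnable::from_foreign`.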
- self.as_mut_ref().driver_data = <T as ForeignOwnable>::into_foreign(data) as _; + self.as_mut_ref().driver_data = <T as ForeignOwnable>::into_foreign(data).cast(); Ok(()) } else { Err(EBUSY) @@ -834,7 +834,7 @@ pub trait Driver { /// CPU frequency driver Registration. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to register a cpufreq driver. /// diff --git a/rust/kernel/cpumask.rs b/rust/kernel/cpumask.rs index e07f8ff5e3fd..3fcbff438670 100644 --- a/rust/kernel/cpumask.rs +++ b/rust/kernel/cpumask.rs @@ -27,7 +27,7 @@ use core::ops::{Deref, DerefMut}; /// The callers must ensure that the `struct cpumask` is valid for access and /// remains valid for the lifetime of the returned reference. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to update a [`Cpumask`]. /// @@ -172,7 +172,7 @@ impl Cpumask { /// The callers must ensure that the `struct cpumask_var_t` is valid for access and remains valid /// for the lifetime of [`CpumaskVar`]. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to create and update a [`CpumaskVar`]. /// diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index ca82926fd67f..b8613289de8e 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -262,10 +262,10 @@ impl<Ctx: DeviceContext> Device<Ctx> { #[cfg(CONFIG_PRINTK)] unsafe { bindings::_dev_printk( - klevel as *const _ as *const crate::ffi::c_char, + klevel.as_ptr().cast::<crate::ffi::c_char>(), self.as_raw(), c_str!("%pA").as_char_ptr(), - &msg as *const _ as *const crate::ffi::c_void, + core::ptr::from_ref(&msg).cast::<crate::ffi::c_void>(), ) }; } diff --git a/rust/kernel/device_id.rs b/rust/kernel/device_id.rs index 8ed2c946144c..70d57814ff79 100644 --- a/rust/kernel/device_id.rs +++ b/rust/kernel/device_id.rs @@ -100,7 +100,7 @@ impl<T: RawDeviceId, U, const N: usize> IdArray<T, U, N> { unsafe { raw_ids[i] .as_mut_ptr() - .byte_offset(data_offset as _) + .byte_add(data_offset) .cast::<usize>() .write(i); } @@ -177,7 +177,7 @@ impl<T: RawDeviceId, U, const N: usize> IdTable<T, U> for IdArray<T, U, N> { fn as_ptr(&self) -> *const T::RawType { // This cannot be `self.ids.as_ptr()`, as the return pointer must have correct provenance // to access the sentinel. - (self as *const Self).cast() + core::ptr::from_ref(self).cast() } fn id(&self, index: usize) -> &T::RawType { diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs index 152a89b78943..da18091143a6 100644 --- a/rust/kernel/devres.rs +++ b/rust/kernel/devres.rs @@ -49,7 +49,7 @@ struct Inner<T: Send> { /// [`Devres`] users should make sure to simply free the corresponding backing resource in `T`'s /// [`Drop`] implementation. /// -/// # Example +/// # Examples /// /// ```no_run /// # use kernel::{bindings, device::{Bound, Device}, devres::Devres, io::{Io, IoRaw}}; @@ -66,19 +66,19 @@ struct Inner<T: Send> { /// unsafe fn new(paddr: usize) -> Result<Self>{ /// // SAFETY: By the safety requirements of this function [`paddr`, `paddr` + `SIZE`) is /// // valid for `ioremap`. 
-/// let addr = unsafe { bindings::ioremap(paddr as _, SIZE as _) }; +/// let addr = unsafe { bindings::ioremap(paddr as bindings::phys_addr_t, SIZE) }; /// if addr.is_null() { /// return Err(ENOMEM); /// } /// -/// Ok(IoMem(IoRaw::new(addr as _, SIZE)?)) +/// Ok(IoMem(IoRaw::new(addr as usize, SIZE)?)) /// } /// } /// /// impl<const SIZE: usize> Drop for IoMem<SIZE> { /// fn drop(&mut self) { /// // SAFETY: `self.0.addr()` is guaranteed to be properly mapped by `Self::new`. -/// unsafe { bindings::iounmap(self.0.addr() as _); }; +/// unsafe { bindings::iounmap(self.0.addr() as *mut c_void); }; /// } /// } /// @@ -219,7 +219,7 @@ impl<T: Send> Devres<T> { /// An error is returned if `dev` does not match the same [`Device`] this [`Devres`] instance /// has been created with. /// - /// # Example + /// # Examples /// /// ```no_run /// # #![cfg(CONFIG_PCI)] diff --git a/rust/kernel/dma.rs b/rust/kernel/dma.rs index b320779ea26f..2bc8ab51ec28 100644 --- a/rust/kernel/dma.rs +++ b/rust/kernel/dma.rs @@ -180,7 +180,7 @@ pub struct Attrs(u32); impl Attrs { /// Get the raw representation of this attribute. pub(crate) fn as_raw(self) -> crate::ffi::c_ulong { - self.0 as _ + self.0 as crate::ffi::c_ulong } /// Check whether `flags` is contained in `self`. @@ -333,7 +333,7 @@ impl<T: AsBytes + FromBytes> CoherentAllocation<T> { dev: dev.into(), dma_handle, count, - cpu_addr: ret as *mut T, + cpu_addr: ret.cast::<T>(), dma_attrs, }) } @@ -436,7 +436,7 @@ impl<T: AsBytes + FromBytes> CoherentAllocation<T> { /// slice is live. /// * Callers must ensure that this call does not race with a read or write to the same region /// while the returned slice is live. - pub unsafe fn as_slice_mut(&self, offset: usize, count: usize) -> Result<&mut [T]> { + pub unsafe fn as_slice_mut(&mut self, offset: usize, count: usize) -> Result<&mut [T]> { self.validate_range(offset, count)?; // SAFETY: // - The pointer is valid due to type invariant on `CoherentAllocation`, @@ -468,7 +468,7 @@ impl<T: AsBytes + FromBytes> CoherentAllocation<T> { /// unsafe { alloc.write(buf, 0)?; } /// # Ok::<(), Error>(()) } /// ``` - pub unsafe fn write(&self, src: &[T], offset: usize) -> Result { + pub unsafe fn write(&mut self, src: &[T], offset: usize) -> Result { self.validate_range(offset, src.len())?; // SAFETY: // - The pointer is valid due to type invariant on `CoherentAllocation` @@ -556,7 +556,7 @@ impl<T: AsBytes + FromBytes> Drop for CoherentAllocation<T> { bindings::dma_free_attrs( self.dev.as_raw(), size, - self.cpu_addr as _, + self.cpu_addr.cast(), self.dma_handle, self.dma_attrs.as_raw(), ) diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs index 32029fde55eb..3bb7c83966cf 100644 --- a/rust/kernel/drm/device.rs +++ b/rust/kernel/drm/device.rs @@ -83,13 +83,13 @@ impl<T: drm::Driver> Device<T> { major: T::INFO.major, minor: T::INFO.minor, patchlevel: T::INFO.patchlevel, - name: T::INFO.name.as_char_ptr() as *mut _, - desc: T::INFO.desc.as_char_ptr() as *mut _, + name: T::INFO.name.as_char_ptr().cast_mut(), + desc: T::INFO.desc.as_char_ptr().cast_mut(), driver_features: drm::driver::FEAT_GEM, ioctls: T::IOCTLS.as_ptr(), num_ioctls: T::IOCTLS.len() as i32, - fops: &Self::GEM_FOPS as _, + fops: &Self::GEM_FOPS, }; const GEM_FOPS: bindings::file_operations = drm::gem::create_fops(); @@ -135,11 +135,9 @@ impl<T: drm::Driver> Device<T> { /// /// `ptr` must be a valid pointer to a `struct device` embedded in `Self`. 
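+ /// That is, `ptr` must point at the `dev` field inside a `Self` allocation.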
unsafe fn from_drm_device(ptr: *const bindings::drm_device) -> *mut Self { - let ptr: *const Opaque<bindings::drm_device> = ptr.cast(); - // SAFETY: By the safety requirements of this function `ptr` is a valid pointer to a // `struct drm_device` embedded in `Self`. - unsafe { crate::container_of!(ptr, Self, dev) }.cast_mut() + unsafe { crate::container_of!(Opaque::cast_from(ptr), Self, dev) }.cast_mut() } /// Not intended to be called externally, except via declare_drm_ioctls!() diff --git a/rust/kernel/drm/gem/mod.rs b/rust/kernel/drm/gem/mod.rs index a24c9a2fc201..b71821cfb5ea 100644 --- a/rust/kernel/drm/gem/mod.rs +++ b/rust/kernel/drm/gem/mod.rs @@ -125,11 +125,9 @@ impl<T: DriverObject> IntoGEMObject for Object<T> { } unsafe fn from_raw<'a>(self_ptr: *mut bindings::drm_gem_object) -> &'a Self { - let self_ptr: *mut Opaque<bindings::drm_gem_object> = self_ptr.cast(); - // SAFETY: `obj` is guaranteed to be in an `Object<T>` via the safety contract of this // function - unsafe { &*crate::container_of!(self_ptr, Object<T>, obj) } + unsafe { &*crate::container_of!(Opaque::cast_from(self_ptr), Object<T>, obj) } } } diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index 083c7b068cf4..a41de293dcd1 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -6,10 +6,10 @@ use crate::{ alloc::{layout::LayoutError, AllocError}, + fmt, str::CStr, }; -use core::fmt; use core::num::NonZeroI32; use core::num::TryFromIntError; use core::str::Utf8Error; @@ -154,7 +154,7 @@ impl Error { /// Returns the error encoded as a pointer. pub fn to_ptr<T>(self) -> *mut T { // SAFETY: `self.0` is a valid error due to its invariant. - unsafe { bindings::ERR_PTR(self.0.get() as _) as *mut _ } + unsafe { bindings::ERR_PTR(self.0.get() as crate::ffi::c_long).cast() } } /// Returns a string representing the error, if one exists. @@ -189,7 +189,7 @@ impl fmt::Debug for Error { Some(name) => f .debug_tuple( // SAFETY: These strings are ASCII-only. - unsafe { core::str::from_utf8_unchecked(name) }, + unsafe { core::str::from_utf8_unchecked(name.to_bytes()) }, ) .finish(), } @@ -220,8 +220,8 @@ impl From<LayoutError> for Error { } } -impl From<core::fmt::Error> for Error { - fn from(_: core::fmt::Error) -> Error { +impl From<fmt::Error> for Error { + fn from(_: fmt::Error) -> Error { code::EINVAL } } diff --git a/rust/kernel/firmware.rs b/rust/kernel/firmware.rs index 4fe621f35716..1abab5b2f052 100644 --- a/rust/kernel/firmware.rs +++ b/rust/kernel/firmware.rs @@ -62,10 +62,11 @@ impl Firmware { fn request_internal(name: &CStr, dev: &Device, func: FwFunc) -> Result<Self> { let mut fw: *mut bindings::firmware = core::ptr::null_mut(); let pfw: *mut *mut bindings::firmware = &mut fw; + let pfw: *mut *const bindings::firmware = pfw.cast(); // SAFETY: `pfw` is a valid pointer to a NULL initialized `bindings::firmware` pointer. // `name` and `dev` are valid as by their type invariants. - let ret = unsafe { func.0(pfw as _, name.as_char_ptr(), dev.as_raw()) }; + let ret = unsafe { func.0(pfw, name.as_char_ptr(), dev.as_raw()) }; if ret != 0 { return Err(Error::from_errno(ret)); } @@ -139,7 +140,7 @@ unsafe impl Sync for Firmware {} /// Typically, such contracts would be enforced by a trait, however traits do not (yet) support /// const functions. /// -/// # Example +/// # Examples /// /// ``` /// # mod module_firmware_test { @@ -181,7 +182,7 @@ unsafe impl Sync for Firmware {} /// module! 
{ /// type: MyModule, /// name: "module_firmware_test", -/// author: "Rust for Linux", +/// authors: ["Rust for Linux"], /// description: "module_firmware! test module", /// license: "GPL", /// } @@ -261,7 +262,7 @@ impl<const N: usize> ModInfoBuilder<N> { /// Append path components to the [`ModInfoBuilder`] instance. Paths need to be separated /// with [`ModInfoBuilder::new_entry`]. /// - /// # Example + /// # Examples /// /// ``` /// use kernel::firmware::ModInfoBuilder; diff --git a/rust/kernel/fmt.rs b/rust/kernel/fmt.rs new file mode 100644 index 000000000000..0306e8388968 --- /dev/null +++ b/rust/kernel/fmt.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Formatting utilities. +//! +//! This module is intended to be used in place of `core::fmt` in kernel code. + +pub use core::fmt::{Arguments, Debug, Display, Error, Formatter, Result, Write}; diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs index 72d84fb0e266..35fd5db35c46 100644 --- a/rust/kernel/fs/file.rs +++ b/rust/kernel/fs/file.rs @@ -366,7 +366,7 @@ impl core::ops::Deref for File { // // By the type invariants, there are no `fdget_pos` calls that did not take the // `f_pos_lock` mutex. - unsafe { LocalFile::from_raw_file(self as *const File as *const bindings::file) } + unsafe { LocalFile::from_raw_file(core::ptr::from_ref(self).cast()) } } } diff --git a/rust/kernel/generated_arch_reachable_asm.rs.S b/rust/kernel/generated_arch_reachable_asm.rs.S new file mode 100644 index 000000000000..3886a9ad3a99 --- /dev/null +++ b/rust/kernel/generated_arch_reachable_asm.rs.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/bug.h> + +// Cut here. + +::kernel::concat_literals!(ARCH_WARN_REACHABLE) diff --git a/rust/kernel/generated_arch_warn_asm.rs.S b/rust/kernel/generated_arch_warn_asm.rs.S new file mode 100644 index 000000000000..409eb4c2d3a1 --- /dev/null +++ b/rust/kernel/generated_arch_warn_asm.rs.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/bug.h> + +// Cut here. + +::kernel::concat_literals!(ARCH_WARN_ASM("{file}", "{line}", "{flags}", "{size}")) diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 21ef202ab0db..4949047af8d7 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -29,15 +29,15 @@ //! //! ## General Examples //! -//! ```rust,ignore -//! # #![allow(clippy::disallowed_names)] +//! ```rust +//! # #![expect(clippy::disallowed_names, clippy::undocumented_unsafe_blocks)] //! use kernel::types::Opaque; //! use pin_init::pin_init_from_closure; //! //! // assume we have some `raw_foo` type in C: //! #[repr(C)] //! struct RawFoo([u8; 16]); -//! extern { +//! extern "C" { //! fn init_foo(_: *mut RawFoo); //! } //! @@ -66,25 +66,17 @@ //! }); //! ``` //! -//! ```rust,ignore -//! # #![allow(unreachable_pub, clippy::disallowed_names)] +//! ```rust +//! # #![expect(unreachable_pub, clippy::disallowed_names)] //! use kernel::{prelude::*, types::Opaque}; //! use core::{ptr::addr_of_mut, marker::PhantomPinned, pin::Pin}; //! # mod bindings { -//! # #![allow(non_camel_case_types)] +//! # #![expect(non_camel_case_types, clippy::missing_safety_doc)] //! # pub struct foo; //! # pub unsafe fn init_foo(_ptr: *mut foo) {} //! # pub unsafe fn destroy_foo(_ptr: *mut foo) {} //! # pub unsafe fn enable_foo(_ptr: *mut foo, _flags: u32) -> i32 { 0 } //! # } -//! # // `Error::from_errno` is `pub(crate)` in the `kernel` crate, thus provide a workaround. -//! # trait FromErrno { -//! # fn from_errno(errno: core::ffi::c_int) -> Error { -//! 
# // Dummy error that can be constructed outside the `kernel` crate. -//! # Error::from(core::fmt::Error) -//! # } -//! # } -//! # impl FromErrno for Error {} //! /// # Invariants //! /// //! /// `foo` is always initialized @@ -108,13 +100,13 @@ //! let foo = addr_of_mut!((*slot).foo); //! //! // Initialize the `foo` -//! bindings::init_foo(Opaque::raw_get(foo)); +//! bindings::init_foo(Opaque::cast_into(foo)); //! //! // Try to enable it. -//! let err = bindings::enable_foo(Opaque::raw_get(foo), flags); +//! let err = bindings::enable_foo(Opaque::cast_into(foo), flags); //! if err != 0 { //! // Enabling has failed, first clean up the foo and then return the error. -//! bindings::destroy_foo(Opaque::raw_get(foo)); +//! bindings::destroy_foo(Opaque::cast_into(foo)); //! return Err(Error::from_errno(err)); //! } //! @@ -206,7 +198,7 @@ pub trait InPlaceInit<T>: Sized { /// /// ```rust /// use kernel::error::Error; -/// use pin_init::zeroed; +/// use pin_init::init_zeroed; /// struct BigBuf { /// big: KBox<[u8; 1024 * 1024 * 1024]>, /// small: [u8; 1024 * 1024], @@ -215,7 +207,7 @@ pub trait InPlaceInit<T>: Sized { /// impl BigBuf { /// fn new() -> impl Init<Self, Error> { /// try_init!(Self { -/// big: KBox::init(zeroed(), GFP_KERNEL)?, +/// big: KBox::init(init_zeroed(), GFP_KERNEL)?, /// small: [0; 1024 * 1024], /// }? Error) /// } @@ -264,7 +256,7 @@ macro_rules! try_init { /// ```rust /// # #![feature(new_uninit)] /// use kernel::error::Error; -/// use pin_init::zeroed; +/// use pin_init::init_zeroed; /// #[pin_data] /// struct BigBuf { /// big: KBox<[u8; 1024 * 1024 * 1024]>, @@ -275,7 +267,7 @@ macro_rules! try_init { /// impl BigBuf { /// fn new() -> impl PinInit<Self, Error> { /// try_pin_init!(Self { -/// big: KBox::init(zeroed(), GFP_KERNEL)?, +/// big: KBox::init(init_zeroed(), GFP_KERNEL)?, /// small: [0; 1024 * 1024], /// ptr: core::ptr::null_mut(), /// }? Error) diff --git a/rust/kernel/io.rs b/rust/kernel/io.rs index b7fc759f8b5d..03b467722b86 100644 --- a/rust/kernel/io.rs +++ b/rust/kernel/io.rs @@ -5,7 +5,7 @@ //! C header: [`include/asm-generic/io.h`](srctree/include/asm-generic/io.h) use crate::error::{code::EINVAL, Result}; -use crate::{bindings, build_assert}; +use crate::{bindings, build_assert, ffi::c_void}; pub mod mem; pub mod resource; @@ -48,7 +48,7 @@ impl<const SIZE: usize> IoRaw<SIZE> { } } -/// IO-mapped memory, starting at the base address @addr and spanning @maxlen bytes. +/// IO-mapped memory region. /// /// The creator (usually a subsystem / bus such as PCI) is responsible for creating the /// mapping, performing an additional region request etc. @@ -61,7 +61,7 @@ impl<const SIZE: usize> IoRaw<SIZE> { /// # Examples /// /// ```no_run -/// # use kernel::{bindings, io::{Io, IoRaw}}; +/// # use kernel::{bindings, ffi::c_void, io::{Io, IoRaw}}; /// # use core::ops::Deref; /// /// // See also [`pci::Bar`] for a real example. @@ -75,19 +75,19 @@ impl<const SIZE: usize> IoRaw<SIZE> { /// unsafe fn new(paddr: usize) -> Result<Self>{ /// // SAFETY: By the safety requirements of this function [`paddr`, `paddr` + `SIZE`) is /// // valid for `ioremap`. 
-/// let addr = unsafe { bindings::ioremap(paddr as _, SIZE as _) }; +/// let addr = unsafe { bindings::ioremap(paddr as bindings::phys_addr_t, SIZE) }; /// if addr.is_null() { /// return Err(ENOMEM); /// } /// -/// Ok(IoMem(IoRaw::new(addr as _, SIZE)?)) +/// Ok(IoMem(IoRaw::new(addr as usize, SIZE)?)) /// } /// } /// /// impl<const SIZE: usize> Drop for IoMem<SIZE> { /// fn drop(&mut self) { /// // SAFETY: `self.0.addr()` is guaranteed to be properly mapped by `Self::new`. -/// unsafe { bindings::iounmap(self.0.addr() as _); }; +/// unsafe { bindings::iounmap(self.0.addr() as *mut c_void); }; /// } /// } /// @@ -124,7 +124,7 @@ macro_rules! define_read { let addr = self.io_addr_assert::<$type_name>(offset); // SAFETY: By the type invariant `addr` is a valid address for MMIO operations. - unsafe { bindings::$c_fn(addr as _) } + unsafe { bindings::$c_fn(addr as *const c_void) } } /// Read IO data from a given offset. @@ -136,7 +136,7 @@ macro_rules! define_read { let addr = self.io_addr::<$type_name>(offset)?; // SAFETY: By the type invariant `addr` is a valid address for MMIO operations. - Ok(unsafe { bindings::$c_fn(addr as _) }) + Ok(unsafe { bindings::$c_fn(addr as *const c_void) }) } }; } @@ -153,7 +153,7 @@ macro_rules! define_write { let addr = self.io_addr_assert::<$type_name>(offset); // SAFETY: By the type invariant `addr` is a valid address for MMIO operations. - unsafe { bindings::$c_fn(value, addr as _, ) } + unsafe { bindings::$c_fn(value, addr as *mut c_void) } } /// Write IO data from a given offset. @@ -165,7 +165,7 @@ macro_rules! define_write { let addr = self.io_addr::<$type_name>(offset)?; // SAFETY: By the type invariant `addr` is a valid address for MMIO operations. - unsafe { bindings::$c_fn(value, addr as _) } + unsafe { bindings::$c_fn(value, addr as *mut c_void) } Ok(()) } }; diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs index b9e65905e121..41efd87595d6 100644 --- a/rust/kernel/kunit.rs +++ b/rust/kernel/kunit.rs @@ -7,7 +7,10 @@ //! Reference: <https://docs.kernel.org/dev-tools/kunit/index.html> use crate::prelude::*; -use core::{ffi::c_void, fmt}; +use core::fmt; + +#[cfg(CONFIG_PRINTK)] +use crate::c_str; /// Prints a KUnit error-level message. /// @@ -19,8 +22,8 @@ pub fn err(args: fmt::Arguments<'_>) { #[cfg(CONFIG_PRINTK)] unsafe { bindings::_printk( - c"\x013%pA".as_ptr() as _, - &args as *const _ as *const c_void, + c_str!("\x013%pA").as_char_ptr(), + core::ptr::from_ref(&args).cast::<c_void>(), ); } } @@ -35,8 +38,8 @@ pub fn info(args: fmt::Arguments<'_>) { #[cfg(CONFIG_PRINTK)] unsafe { bindings::_printk( - c"\x016%pA".as_ptr() as _, - &args as *const _ as *const c_void, + c_str!("\x016%pA").as_char_ptr(), + core::ptr::from_ref(&args).cast::<c_void>(), ); } } diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index c2d1b9375205..ed53169e795c 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -62,8 +62,10 @@ pub mod acpi; pub mod alloc; #[cfg(CONFIG_AUXILIARY_BUS)] pub mod auxiliary; +pub mod bits; #[cfg(CONFIG_BLOCK)] pub mod block; +pub mod bug; #[doc(hidden)] pub mod build_assert; pub mod clk; @@ -85,6 +87,7 @@ pub mod error; pub mod faux; #[cfg(CONFIG_RUST_FW_LOADER_ABSTRACTIONS)] pub mod firmware; +pub mod fmt; pub mod fs; pub mod init; pub mod io; @@ -213,6 +216,13 @@ fn panic(info: &core::panic::PanicInfo<'_>) -> ! { /// Produces a pointer to an object from a pointer to one of its fields. 
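+///
+/// For example (a minimal sketch of the intended use):
+///
+/// ```
+/// use kernel::container_of;
+///
+/// struct Pair {
+///     first: i32,
+///     second: i32,
+/// }
+///
+/// let pair = Pair { first: 1, second: 2 };
+/// assert_eq!(pair.first, 1);
+///
+/// let field_ptr: *const i32 = &pair.second;
+/// // SAFETY: `field_ptr` points at the `second` field of a live `Pair`, so the
+/// // resulting pointer is in bounds of the same allocation.
+/// let pair_ptr = unsafe { container_of!(field_ptr, Pair, second) };
+/// assert!(core::ptr::eq(pair_ptr, &pair));
+/// ```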
/// +/// If you encounter a type mismatch due to the [`Opaque`] type, then use [`Opaque::cast_into`] or +/// [`Opaque::cast_from`] to resolve the mismatch. +/// +/// [`Opaque`]: crate::types::Opaque +/// [`Opaque::cast_into`]: crate::types::Opaque::cast_into +/// [`Opaque::cast_from`]: crate::types::Opaque::cast_from +/// /// # Safety /// /// The pointer passed to this macro, and the pointer returned by this macro, must both be in diff --git a/rust/kernel/list.rs b/rust/kernel/list.rs index c391c30b80f8..44e5219cfcbc 100644 --- a/rust/kernel/list.rs +++ b/rust/kernel/list.rs @@ -57,14 +57,11 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// } /// } /// -/// impl_has_list_links! { -/// impl HasListLinks<0> for BasicItem { self.links } -/// } /// impl_list_arc_safe! { /// impl ListArcSafe<0> for BasicItem { untracked; } /// } /// impl_list_item! { -/// impl ListItem<0> for BasicItem { using ListLinks; } +/// impl ListItem<0> for BasicItem { using ListLinks { self.links }; } /// } /// /// // Create a new empty list. @@ -82,9 +79,9 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// // [15, 10, 30] /// { /// let mut iter = list.iter(); -/// assert_eq!(iter.next().unwrap().value, 15); -/// assert_eq!(iter.next().unwrap().value, 10); -/// assert_eq!(iter.next().unwrap().value, 30); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 15); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 10); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 30); /// assert!(iter.next().is_none()); /// /// // Verify the length of the list. @@ -93,9 +90,9 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// /// // Pop the items from the list using `pop_back()` and verify the content. /// { -/// assert_eq!(list.pop_back().unwrap().value, 30); -/// assert_eq!(list.pop_back().unwrap().value, 10); -/// assert_eq!(list.pop_back().unwrap().value, 15); +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value, 30); +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value, 10); +/// assert_eq!(list.pop_back().ok_or(EINVAL)?.value, 15); /// } /// /// // Insert 3 elements using `push_front()`. @@ -107,9 +104,9 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// // [30, 10, 15] /// { /// let mut iter = list.iter(); -/// assert_eq!(iter.next().unwrap().value, 30); -/// assert_eq!(iter.next().unwrap().value, 10); -/// assert_eq!(iter.next().unwrap().value, 15); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 30); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 10); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 15); /// assert!(iter.next().is_none()); /// /// // Verify the length of the list. @@ -118,8 +115,8 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// /// // Pop the items from the list using `pop_front()` and verify the content. /// { -/// assert_eq!(list.pop_front().unwrap().value, 30); -/// assert_eq!(list.pop_front().unwrap().value, 10); +/// assert_eq!(list.pop_front().ok_or(EINVAL)?.value, 30); +/// assert_eq!(list.pop_front().ok_or(EINVAL)?.value, 10); /// } /// /// // Push `list2` to `list` through `push_all_back()`. 
@@ -135,9 +132,9 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField}; /// // list: [15, 25, 35] /// // list2: [] /// let mut iter = list.iter(); -/// assert_eq!(iter.next().unwrap().value, 15); -/// assert_eq!(iter.next().unwrap().value, 25); -/// assert_eq!(iter.next().unwrap().value, 35); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 15); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 25); +/// assert_eq!(iter.next().ok_or(EINVAL)?.value, 35); /// assert!(iter.next().is_none()); /// assert!(list2.is_empty()); /// } @@ -284,7 +281,7 @@ impl<const ID: u64> ListLinks<ID> { #[inline] unsafe fn fields(me: *mut Self) -> *mut ListLinksFields { // SAFETY: The caller promises that the pointer is valid. - unsafe { Opaque::raw_get(ptr::addr_of!((*me).inner)) } + unsafe { Opaque::cast_into(ptr::addr_of!((*me).inner)) } } /// # Safety @@ -320,9 +317,6 @@ unsafe impl<T: ?Sized + Send, const ID: u64> Send for ListLinksSelfPtr<T, ID> {} unsafe impl<T: ?Sized + Sync, const ID: u64> Sync for ListLinksSelfPtr<T, ID> {} impl<T: ?Sized, const ID: u64> ListLinksSelfPtr<T, ID> { - /// The offset from the [`ListLinks`] to the self pointer field. - pub const LIST_LINKS_SELF_PTR_OFFSET: usize = core::mem::offset_of!(Self, self_ptr); - /// Creates a new initializer for this type. pub fn new() -> impl PinInit<Self> { // INVARIANT: Pin-init initializers can't be used on an existing `Arc`, so this value will @@ -337,6 +331,16 @@ impl<T: ?Sized, const ID: u64> ListLinksSelfPtr<T, ID> { self_ptr: Opaque::uninit(), } } + + /// Returns a pointer to the self pointer. + /// + /// # Safety + /// + /// The provided pointer must point at a valid struct of type `Self`. + pub unsafe fn raw_get_self_ptr(me: *const Self) -> *const Opaque<*const T> { + // SAFETY: The caller promises that the pointer is valid. + unsafe { ptr::addr_of!((*me).self_ptr) } + } } impl<T: ?Sized + ListItem<ID>, const ID: u64> List<T, ID> { @@ -711,14 +715,11 @@ impl<'a, T: ?Sized + ListItem<ID>, const ID: u64> Iterator for Iter<'a, T, ID> { /// } /// } /// -/// kernel::list::impl_has_list_links! { -/// impl HasListLinks<0> for ListItem { self.links } -/// } /// kernel::list::impl_list_arc_safe! { /// impl ListArcSafe<0> for ListItem { untracked; } /// } /// kernel::list::impl_list_item! { -/// impl ListItem<0> for ListItem { using ListLinks; } +/// impl ListItem<0> for ListItem { using ListLinks { self.links }; } /// } /// /// // Use a cursor to remove the first element with the given value. @@ -809,11 +810,11 @@ impl<'a, T: ?Sized + ListItem<ID>, const ID: u64> Iterator for Iter<'a, T, ID> { /// merge_sorted(&mut list, list2); /// /// let mut items = list.into_iter(); -/// assert_eq!(items.next().unwrap().value, 10); -/// assert_eq!(items.next().unwrap().value, 11); -/// assert_eq!(items.next().unwrap().value, 12); -/// assert_eq!(items.next().unwrap().value, 13); -/// assert_eq!(items.next().unwrap().value, 14); +/// assert_eq!(items.next().ok_or(EINVAL)?.value, 10); +/// assert_eq!(items.next().ok_or(EINVAL)?.value, 11); +/// assert_eq!(items.next().ok_or(EINVAL)?.value, 12); +/// assert_eq!(items.next().ok_or(EINVAL)?.value, 13); +/// assert_eq!(items.next().ok_or(EINVAL)?.value, 14); /// assert!(items.next().is_none()); /// # Result::<(), Error>::Ok(()) /// ``` diff --git a/rust/kernel/list/impl_list_item_mod.rs b/rust/kernel/list/impl_list_item_mod.rs index a0438537cee1..202bc6f97c13 100644 --- a/rust/kernel/list/impl_list_item_mod.rs +++ b/rust/kernel/list/impl_list_item_mod.rs @@ -4,60 +4,48 @@ //! 
Helpers for implementing list traits safely. -use crate::list::ListLinks; - -/// Declares that this type has a `ListLinks<ID>` field at a fixed offset. +/// Declares that this type has a [`ListLinks<ID>`] field. /// -/// This trait is only used to help implement `ListItem` safely. If `ListItem` is implemented +/// This trait is only used to help implement [`ListItem`] safely. If [`ListItem`] is implemented /// manually, then this trait is not needed. Use the [`impl_has_list_links!`] macro to implement /// this trait. /// /// # Safety /// -/// All values of this type must have a `ListLinks<ID>` field at the given offset. +/// The methods on this trait must have exactly the behavior that the definitions given below have. /// -/// The behavior of `raw_get_list_links` must not be changed. +/// [`ListLinks<ID>`]: crate::list::ListLinks +/// [`ListItem`]: crate::list::ListItem pub unsafe trait HasListLinks<const ID: u64 = 0> { - /// The offset of the `ListLinks` field. - const OFFSET: usize; - - /// Returns a pointer to the [`ListLinks<T, ID>`] field. + /// Returns a pointer to the [`ListLinks<ID>`] field. /// /// # Safety /// /// The provided pointer must point at a valid struct of type `Self`. /// - /// [`ListLinks<T, ID>`]: ListLinks - // We don't really need this method, but it's necessary for the implementation of - // `impl_has_list_links!` to be correct. - #[inline] - unsafe fn raw_get_list_links(ptr: *mut Self) -> *mut ListLinks<ID> { - // SAFETY: The caller promises that the pointer is valid. The implementer promises that the - // `OFFSET` constant is correct. - unsafe { (ptr as *mut u8).add(Self::OFFSET) as *mut ListLinks<ID> } - } + /// [`ListLinks<ID>`]: crate::list::ListLinks + unsafe fn raw_get_list_links(ptr: *mut Self) -> *mut crate::list::ListLinks<ID>; } /// Implements the [`HasListLinks`] trait for the given type. #[macro_export] macro_rules! impl_has_list_links { - ($(impl$(<$($implarg:ident),*>)? + ($(impl$({$($generics:tt)*})? HasListLinks$(<$id:tt>)? - for $self:ident $(<$($selfarg:ty),*>)? + for $self:ty { self$(.$field:ident)* } )*) => {$( // SAFETY: The implementation of `raw_get_list_links` only compiles if the field has the // right type. - // - // The behavior of `raw_get_list_links` is not changed since the `addr_of_mut!` macro is - // equivalent to the pointer offset operation in the trait definition. - unsafe impl$(<$($implarg),*>)? $crate::list::HasListLinks$(<$id>)? for - $self $(<$($selfarg),*>)? - { - const OFFSET: usize = ::core::mem::offset_of!(Self, $($field).*) as usize; - + unsafe impl$(<$($generics)*>)? $crate::list::HasListLinks$(<$id>)? for $self { #[inline] unsafe fn raw_get_list_links(ptr: *mut Self) -> *mut $crate::list::ListLinks$(<$id>)? { + // Statically ensure that `$(.field)*` doesn't follow any pointers. + // + // Cannot be `const` because `$self` may contain generics and E0401 says constants + // "can't use {`Self`,generic parameters} from outer item". + if false { let _: usize = ::core::mem::offset_of!(Self, $($field).*); } + // SAFETY: The caller promises that the pointer is not dangling. We know that this // expression doesn't follow any pointers, as the `offset_of!` invocation above // would otherwise not compile. @@ -68,12 +56,16 @@ macro_rules! impl_has_list_links { } pub use impl_has_list_links; -/// Declares that the `ListLinks<ID>` field in this struct is inside a `ListLinksSelfPtr<T, ID>`. +/// Declares that the [`ListLinks<ID>`] field in this struct is inside a +/// [`ListLinksSelfPtr<T, ID>`]. 
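+///
+/// Use the [`impl_has_list_links_self_ptr!`] macro to implement this trait.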
/// /// # Safety /// -/// The `ListLinks<ID>` field of this struct at the offset `HasListLinks<ID>::OFFSET` must be -/// inside a `ListLinksSelfPtr<T, ID>`. +/// The [`ListLinks<ID>`] field of this struct at [`HasListLinks<ID>::raw_get_list_links`] must be +/// inside a [`ListLinksSelfPtr<T, ID>`]. +/// +/// [`ListLinks<ID>`]: crate::list::ListLinks +/// [`ListLinksSelfPtr<T, ID>`]: crate::list::ListLinksSelfPtr pub unsafe trait HasSelfPtr<T: ?Sized, const ID: u64 = 0> where Self: HasListLinks<ID>, @@ -83,27 +75,21 @@ where /// Implements the [`HasListLinks`] and [`HasSelfPtr`] traits for the given type. #[macro_export] macro_rules! impl_has_list_links_self_ptr { - ($(impl$({$($implarg:tt)*})? + ($(impl$({$($generics:tt)*})? HasSelfPtr<$item_type:ty $(, $id:tt)?> - for $self:ident $(<$($selfarg:ty),*>)? - { self.$field:ident } + for $self:ty + { self$(.$field:ident)* } )*) => {$( // SAFETY: The implementation of `raw_get_list_links` only compiles if the field has the // right type. - unsafe impl$(<$($implarg)*>)? $crate::list::HasSelfPtr<$item_type $(, $id)?> for - $self $(<$($selfarg),*>)? - {} - - unsafe impl$(<$($implarg)*>)? $crate::list::HasListLinks$(<$id>)? for - $self $(<$($selfarg),*>)? - { - const OFFSET: usize = ::core::mem::offset_of!(Self, $field) as usize; + unsafe impl$(<$($generics)*>)? $crate::list::HasSelfPtr<$item_type $(, $id)?> for $self {} + unsafe impl$(<$($generics)*>)? $crate::list::HasListLinks$(<$id>)? for $self { #[inline] unsafe fn raw_get_list_links(ptr: *mut Self) -> *mut $crate::list::ListLinks$(<$id>)? { // SAFETY: The caller promises that the pointer is not dangling. let ptr: *mut $crate::list::ListLinksSelfPtr<$item_type $(, $id)?> = - unsafe { ::core::ptr::addr_of_mut!((*ptr).$field) }; + unsafe { ::core::ptr::addr_of_mut!((*ptr)$(.$field)*) }; ptr.cast() } } @@ -117,15 +103,95 @@ pub use impl_has_list_links_self_ptr; /// implement that trait. /// /// [`ListItem`]: crate::list::ListItem +/// +/// # Examples +/// +/// ``` +/// #[pin_data] +/// struct SimpleListItem { +/// value: u32, +/// #[pin] +/// links: kernel::list::ListLinks, +/// } +/// +/// kernel::list::impl_list_arc_safe! { +/// impl ListArcSafe<0> for SimpleListItem { untracked; } +/// } +/// +/// kernel::list::impl_list_item! { +/// impl ListItem<0> for SimpleListItem { using ListLinks { self.links }; } +/// } +/// +/// struct ListLinksHolder { +/// inner: kernel::list::ListLinks, +/// } +/// +/// #[pin_data] +/// struct ComplexListItem<T, U> { +/// value: Result<T, U>, +/// #[pin] +/// links: ListLinksHolder, +/// } +/// +/// kernel::list::impl_list_arc_safe! { +/// impl{T, U} ListArcSafe<0> for ComplexListItem<T, U> { untracked; } +/// } +/// +/// kernel::list::impl_list_item! { +/// impl{T, U} ListItem<0> for ComplexListItem<T, U> { using ListLinks { self.links.inner }; } +/// } +/// ``` +/// +/// ``` +/// #[pin_data] +/// struct SimpleListItem { +/// value: u32, +/// #[pin] +/// links: kernel::list::ListLinksSelfPtr<SimpleListItem>, +/// } +/// +/// kernel::list::impl_list_arc_safe! { +/// impl ListArcSafe<0> for SimpleListItem { untracked; } +/// } +/// +/// kernel::list::impl_list_item! { +/// impl ListItem<0> for SimpleListItem { using ListLinksSelfPtr { self.links }; } +/// } +/// +/// struct ListLinksSelfPtrHolder<T, U> { +/// inner: kernel::list::ListLinksSelfPtr<ComplexListItem<T, U>>, +/// } +/// +/// #[pin_data] +/// struct ComplexListItem<T, U> { +/// value: Result<T, U>, +/// #[pin] +/// links: ListLinksSelfPtrHolder<T, U>, +/// } +/// +/// kernel::list::impl_list_arc_safe! 
{
/// impl{T, U} ListArcSafe<0> for ComplexListItem<T, U> { untracked; }
/// }
///
/// kernel::list::impl_list_item! {
/// impl{T, U} ListItem<0> for ComplexListItem<T, U> {
/// using ListLinksSelfPtr { self.links.inner };
/// }
/// }
/// ```
#[macro_export]
macro_rules! impl_list_item {
 (
- $(impl$({$($generics:tt)*})? ListItem<$num:tt> for $t:ty {
- using ListLinks;
+ $(impl$({$($generics:tt)*})? ListItem<$num:tt> for $self:ty {
+ using ListLinks { self$(.$field:ident)* };
 })*
 ) => {$(
+ $crate::list::impl_has_list_links! {
+ impl$({$($generics)*})? HasListLinks<$num> for $self { self$(.$field)* }
+ }
+
 // SAFETY: See GUARANTEES comment on each method.
- unsafe impl$(<$($generics)*>)? $crate::list::ListItem<$num> for $t {
+ unsafe impl$(<$($generics)*>)? $crate::list::ListItem<$num> for $self {
 // GUARANTEES:
 // * This returns the same pointer as `prepare_to_insert` because `prepare_to_insert`
 // is implemented in terms of `view_links`.
@@ -139,20 +205,19 @@ macro_rules! impl_list_item {
 }

 // GUARANTEES:
- // * `me` originates from the most recent call to `prepare_to_insert`, which just added
- // `offset` to the pointer passed to `prepare_to_insert`. This method subtracts
- // `offset` from `me` so it returns the pointer originally passed to
- // `prepare_to_insert`.
+ // * `me` originates from the most recent call to `prepare_to_insert`, which calls
+ // `raw_get_list_links`, which is implemented using `addr_of_mut!((*self)$(.$field)*)`.
+ // This method uses `container_of` to perform the inverse operation, so it returns the
+ // pointer originally passed to `prepare_to_insert`.
 // * The pointer remains valid until the next call to `post_remove` because the caller
 // of the most recent call to `prepare_to_insert` promised to retain ownership of the
 // `ListArc` containing `Self` until the next call to `post_remove`. The value cannot
 // be destroyed while a `ListArc` reference exists.
 unsafe fn view_value(me: *mut $crate::list::ListLinks<$num>) -> *const Self {
- let offset = <Self as $crate::list::HasListLinks<$num>>::OFFSET;
 // SAFETY: `me` originates from the most recent call to `prepare_to_insert`, so it
- // points at the field at offset `offset` in a value of type `Self`. Thus,
- // subtracting `offset` from `me` is still in-bounds of the allocation.
- unsafe { (me as *const u8).sub(offset) as *const Self }
+ // points at the field `$field` in a value of type `Self`. Thus, reversing that
+ // operation is still in-bounds of the allocation.
+ $crate::container_of!(me, Self, $($field).*)
 }

 // GUARANTEES:
- // * `me` originates from the most recent call to `prepare_to_insert`, which just added
- // `offset` to the pointer passed to `prepare_to_insert`. This method subtracts
- // `offset` from `me` so it returns the pointer originally passed to
- // `prepare_to_insert`.
+ // * `me` originates from the most recent call to `prepare_to_insert`, which calls
+ // `raw_get_list_links`, which is implemented using `addr_of_mut!((*self)$(.$field)*)`.
+ // This method uses `container_of` to perform the inverse operation, so it returns the
+ // pointer originally passed to `prepare_to_insert`.
 unsafe fn post_remove(me: *mut $crate::list::ListLinks<$num>) -> *const Self {
- let offset = <Self as $crate::list::HasListLinks<$num>>::OFFSET;
 // SAFETY: `me` originates from the most recent call to `prepare_to_insert`, so it
- // points at the field at offset `offset` in a value of type `Self`. Thus,
- // subtracting `offset` from `me` is still in-bounds of the allocation.
- unsafe { (me as *const u8).sub(offset) as *const Self }
+ // points at the field `$field` in a value of type `Self`. Thus, reversing that
+ // operation is still in-bounds of the allocation.
+ $crate::container_of!(me, Self, $($field).*)
 }
 }
 )*};
 (
- $(impl$({$($generics:tt)*})? ListItem<$num:tt> for $t:ty {
- using ListLinksSelfPtr;
+ $(impl$({$($generics:tt)*})? ListItem<$num:tt> for $self:ty {
+ using ListLinksSelfPtr { self$(.$field:ident)* };
 })*
 ) => {$(
+ $crate::list::impl_has_list_links_self_ptr! {
+ impl$({$($generics)*})? HasSelfPtr<$self> for $self { self$(.$field)* }
+ }
+
 // SAFETY: See GUARANTEES comment on each method.
- unsafe impl$(<$($generics)*>)? $crate::list::ListItem<$num> for $t {
+ unsafe impl$(<$($generics)*>)? $crate::list::ListItem<$num> for $self {
 // GUARANTEES:
 // This implementation of `ListItem` will not give out exclusive access to the same
 // `ListLinks` several times because calls to `prepare_to_insert` and `post_remove`
@@ -202,14 +270,16 @@ macro_rules! impl_list_item {
 // SAFETY: The caller promises that `me` points at a valid value of type `Self`.
 let links_field = unsafe { <Self as $crate::list::ListItem<$num>>::view_links(me) };

- let spoff = $crate::list::ListLinksSelfPtr::<Self, $num>::LIST_LINKS_SELF_PTR_OFFSET;
- // Goes via the offset as the field is private.
- //
- // SAFETY: The constant is equal to `offset_of!(ListLinksSelfPtr, self_ptr)`, so
- // the pointer stays in bounds of the allocation.
- let self_ptr = unsafe { (links_field as *const u8).add(spoff) } - as *const ::core::cell::UnsafeCell<*const Self>; - let cell_inner = ::core::cell::UnsafeCell::raw_get(self_ptr); + let container = $crate::container_of!( + links_field, $crate::list::ListLinksSelfPtr<Self, $num>, inner + ); + + // SAFETY: By the same reasoning above, `links_field` is a valid pointer. + let self_ptr = unsafe { + $crate::list::ListLinksSelfPtr::raw_get_self_ptr(container) + }; + + let cell_inner = $crate::types::Opaque::cast_into(self_ptr); + // SAFETY: This is not a data race, because the only function that writes to this // value is `prepare_to_insert`, but by the safety requirements the // `prepare_to_insert` method may not be called in parallel with `view_value` or diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs index a1eb5737e3cb..6373fe183b27 100644 --- a/rust/kernel/miscdevice.rs +++ b/rust/kernel/miscdevice.rs @@ -33,7 +33,7 @@ impl MiscDeviceOptions { pub const fn into_raw<T: MiscDevice>(self) -> bindings::miscdevice { // SAFETY: All zeros is valid for this C type. let mut result: bindings::miscdevice = unsafe { MaybeUninit::zeroed().assume_init() }; - result.minor = bindings::MISC_DYNAMIC_MINOR as _; + result.minor = bindings::MISC_DYNAMIC_MINOR as ffi::c_int; result.name = self.name.as_char_ptr(); result.fops = MiscdeviceVTable::<T>::build(); result @@ -222,7 +222,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { // type. // // SAFETY: The open call of a file can access the private data. - unsafe { (*raw_file).private_data = ptr.into_foreign().cast() }; + unsafe { (*raw_file).private_data = ptr.into_foreign() }; 0 } @@ -233,7 +233,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { /// must be associated with a `MiscDeviceRegistration<T>`. unsafe extern "C" fn release(_inode: *mut bindings::inode, file: *mut bindings::file) -> c_int { // SAFETY: The release call of a file owns the private data. - let private = unsafe { (*file).private_data }.cast(); + let private = unsafe { (*file).private_data }; // SAFETY: The release call of a file owns the private data. let ptr = unsafe { <T::Ptr as ForeignOwnable>::from_foreign(private) }; @@ -277,7 +277,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { /// `file` must be a valid file that is associated with a `MiscDeviceRegistration<T>`. unsafe extern "C" fn ioctl(file: *mut bindings::file, cmd: c_uint, arg: c_ulong) -> c_long { // SAFETY: The ioctl call of a file can access the private data. - let private = unsafe { (*file).private_data }.cast(); + let private = unsafe { (*file).private_data }; // SAFETY: Ioctl calls can borrow the private data of the file. let device = unsafe { <T::Ptr as ForeignOwnable>::borrow(private) }; @@ -302,7 +302,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { arg: c_ulong, ) -> c_long { // SAFETY: The compat ioctl call of a file can access the private data. - let private = unsafe { (*file).private_data }.cast(); + let private = unsafe { (*file).private_data }; // SAFETY: Ioctl calls can borrow the private data of the file. let device = unsafe { <T::Ptr as ForeignOwnable>::borrow(private) }; @@ -323,7 +323,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> { /// - `seq_file` must be a valid `struct seq_file` that we can write to. unsafe extern "C" fn show_fdinfo(seq_file: *mut bindings::seq_file, file: *mut bindings::file) { // SAFETY: The release call of a file owns the private data. 
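+ // The callback only borrows the data for the duration of the call; it
+ // is not consumed.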
- let private = unsafe { (*file).private_data }.cast(); + let private = unsafe { (*file).private_data }; // SAFETY: Ioctl calls can borrow the private data of the file. let device = unsafe { <T::Ptr as ForeignOwnable>::borrow(private) }; // SAFETY: diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index 31803674aecc..6086ca981b06 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -392,80 +392,80 @@ pub mod flags { use crate::bindings; /// No flags are set. - pub const NONE: vm_flags_t = bindings::VM_NONE as _; + pub const NONE: vm_flags_t = bindings::VM_NONE as vm_flags_t; /// Mapping allows reads. - pub const READ: vm_flags_t = bindings::VM_READ as _; + pub const READ: vm_flags_t = bindings::VM_READ as vm_flags_t; /// Mapping allows writes. - pub const WRITE: vm_flags_t = bindings::VM_WRITE as _; + pub const WRITE: vm_flags_t = bindings::VM_WRITE as vm_flags_t; /// Mapping allows execution. - pub const EXEC: vm_flags_t = bindings::VM_EXEC as _; + pub const EXEC: vm_flags_t = bindings::VM_EXEC as vm_flags_t; /// Mapping is shared. - pub const SHARED: vm_flags_t = bindings::VM_SHARED as _; + pub const SHARED: vm_flags_t = bindings::VM_SHARED as vm_flags_t; /// Mapping may be updated to allow reads. - pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as _; + pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as vm_flags_t; /// Mapping may be updated to allow writes. - pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as _; + pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as vm_flags_t; /// Mapping may be updated to allow execution. - pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as _; + pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as vm_flags_t; /// Mapping may be updated to be shared. - pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as _; + pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as vm_flags_t; /// Page-ranges managed without `struct page`, just pure PFN. - pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as _; + pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as vm_flags_t; /// Memory mapped I/O or similar. - pub const IO: vm_flags_t = bindings::VM_IO as _; + pub const IO: vm_flags_t = bindings::VM_IO as vm_flags_t; /// Do not copy this vma on fork. - pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as _; + pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as vm_flags_t; /// Cannot expand with mremap(). - pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as _; + pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as vm_flags_t; /// Lock the pages covered when they are faulted in. - pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as _; + pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as vm_flags_t; /// Is a VM accounted object. - pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as _; + pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as vm_flags_t; /// Should the VM suppress accounting. - pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as _; + pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as vm_flags_t; /// Huge TLB Page VM. - pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as _; + pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as vm_flags_t; /// Synchronous page faults. (DAX-specific) - pub const SYNC: vm_flags_t = bindings::VM_SYNC as _; + pub const SYNC: vm_flags_t = bindings::VM_SYNC as vm_flags_t; /// Architecture-specific flag. 
- pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as _; + pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as vm_flags_t; /// Wipe VMA contents in child on fork. - pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as _; + pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as vm_flags_t; /// Do not include in the core dump. - pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as _; + pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as vm_flags_t; /// Not soft dirty clean area. - pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as _; + pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as vm_flags_t; /// Can contain `struct page` and pure PFN pages. - pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as _; + pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as vm_flags_t; /// MADV_HUGEPAGE marked this vma. - pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as _; + pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as vm_flags_t; /// MADV_NOHUGEPAGE marked this vma. - pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as _; + pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as vm_flags_t; /// KSM may merge identical pages. - pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as _; + pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as vm_flags_t; } diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index 602609027aa6..7de5cc7a0eee 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -142,7 +142,7 @@ impl Device { // SAFETY: The struct invariant ensures that we may access // this field without additional synchronization. let bit_field = unsafe { &(*self.0.get())._bitfield_1 }; - bit_field.get(13, 1) == bindings::AUTONEG_ENABLE as u64 + bit_field.get(13, 1) == u64::from(bindings::AUTONEG_ENABLE) } /// Gets the current auto-negotiation state. @@ -419,7 +419,7 @@ impl<T: Driver> Adapter<T> { // where we hold `phy_device->lock`, so the accessors on // `Device` are okay to call. let dev = unsafe { Device::from_raw(phydev) }; - T::match_phy_device(dev) as i32 + T::match_phy_device(dev).into() } /// # Safety diff --git a/rust/kernel/of.rs b/rust/kernel/of.rs index 0888469bddb7..b76b35265df2 100644 --- a/rust/kernel/of.rs +++ b/rust/kernel/of.rs @@ -27,7 +27,7 @@ unsafe impl RawDeviceIdIndex for DeviceId { const DRIVER_DATA_OFFSET: usize = core::mem::offset_of!(bindings::of_device_id, data); fn index(&self) -> usize { - self.0.data as _ + self.0.data as usize } } @@ -39,10 +39,10 @@ impl DeviceId { // SAFETY: FFI type is valid to be zero-initialized. let mut of: bindings::of_device_id = unsafe { core::mem::zeroed() }; - // TODO: Use `clone_from_slice` once the corresponding types do match. + // TODO: Use `copy_from_slice` once stabilized for `const`. let mut i = 0; while i < src.len() { - of.compatible[i] = src[i] as _; + of.compatible[i] = src[i]; i += 1; } diff --git a/rust/kernel/opp.rs b/rust/kernel/opp.rs index 846583da9a2f..08126035d2c6 100644 --- a/rust/kernel/opp.rs +++ b/rust/kernel/opp.rs @@ -92,7 +92,7 @@ fn to_c_str_array(names: &[CString]) -> Result<KVec<*const u8>> { let mut list = KVec::with_capacity(names.len() + 1, GFP_KERNEL)?; for name in names.iter() { - list.push(name.as_ptr() as _, GFP_KERNEL)?; + list.push(name.as_ptr().cast(), GFP_KERNEL)?; } list.push(ptr::null(), GFP_KERNEL)?; @@ -103,7 +103,7 @@ fn to_c_str_array(names: &[CString]) -> Result<KVec<*const u8>> { /// /// Represents voltage in microvolts, wrapping a [`c_ulong`] value. 
/// -/// ## Examples +/// # Examples /// /// ``` /// use kernel::opp::MicroVolt; @@ -128,7 +128,7 @@ impl From<MicroVolt> for c_ulong { /// /// Represents power in microwatts, wrapping a [`c_ulong`] value. /// -/// ## Examples +/// # Examples /// /// ``` /// use kernel::opp::MicroWatt; @@ -153,7 +153,7 @@ impl From<MicroWatt> for c_ulong { /// /// The associated [`OPP`] is automatically removed when the [`Token`] is dropped. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to create an [`OPP`] dynamically. /// @@ -202,7 +202,7 @@ impl Drop for Token { /// Rust abstraction for the C `struct dev_pm_opp_data`, used to define operating performance /// points (OPPs) dynamically. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to create an [`OPP`] with [`Data`]. /// @@ -254,7 +254,7 @@ impl Data { /// [`OPP`] search options. /// -/// ## Examples +/// # Examples /// /// Defines how to search for an [`OPP`] in a [`Table`] relative to a frequency. /// @@ -326,7 +326,7 @@ impl Drop for ConfigToken { /// /// Rust abstraction for the C `struct dev_pm_opp_config`. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to set OPP property-name configuration for a [`Device`]. /// @@ -345,7 +345,7 @@ impl Drop for ConfigToken { /// impl ConfigOps for Driver {} /// /// fn configure(dev: &ARef<Device>) -> Result<ConfigToken> { -/// let name = CString::try_from_fmt(fmt!("{}", "slow"))?; +/// let name = CString::try_from_fmt(fmt!("slow"))?; /// /// // The OPP configuration is cleared once the [`ConfigToken`] goes out of scope. /// Config::<Driver>::new() @@ -569,7 +569,7 @@ impl<T: ConfigOps + Default> Config<T> { /// /// Instances of this type are reference-counted. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to get OPP [`Table`] for a [`Cpumask`] and set its /// frequency. @@ -1011,7 +1011,7 @@ impl Drop for Table { /// /// A reference to the [`OPP`], &[`OPP`], isn't refcounted by the Rust code. /// -/// ## Examples +/// # Examples /// /// The following example demonstrates how to get [`OPP`] corresponding to a frequency value and /// configure the device with it. diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 44a2f3d2884a..887ee611b553 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -98,7 +98,7 @@ impl<T: Driver + 'static> Adapter<T> { /// Declares a kernel module that exposes a single PCI driver. /// -/// # Example +/// # Examples /// ///```ignore /// kernel::module_pci_driver! { @@ -170,7 +170,7 @@ unsafe impl RawDeviceIdIndex for DeviceId { const DRIVER_DATA_OFFSET: usize = core::mem::offset_of!(bindings::pci_device_id, driver_data); fn index(&self) -> usize { - self.0.driver_data as _ + self.0.driver_data } } @@ -193,7 +193,7 @@ macro_rules! pci_device_table { /// The PCI driver trait. /// -/// # Example +/// # Examples /// ///``` /// # use kernel::{bindings, device::Core, pci}; @@ -205,7 +205,10 @@ macro_rules! pci_device_table { /// MODULE_PCI_TABLE, /// <MyDriver as pci::Driver>::IdInfo, /// [ -/// (pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, bindings::PCI_ANY_ID as _), ()) +/// ( +/// pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, bindings::PCI_ANY_ID as u32), +/// (), +/// ) /// ] /// ); /// @@ -344,7 +347,7 @@ impl<const SIZE: usize> Bar<SIZE> { // `ioptr` is valid by the safety requirements. // `num` is valid by the safety requirements. 
unsafe { - bindings::pci_iounmap(pdev.as_raw(), ioptr as _); + bindings::pci_iounmap(pdev.as_raw(), ioptr as *mut kernel::ffi::c_void); bindings::pci_release_region(pdev.as_raw(), num); } } diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs index b4d3087aff52..8f028c76f9fa 100644 --- a/rust/kernel/platform.rs +++ b/rust/kernel/platform.rs @@ -132,7 +132,7 @@ macro_rules! module_platform_driver { /// /// Drivers must implement this trait in order to get a platform driver registered. /// -/// # Example +/// # Examples /// ///``` /// # use kernel::{acpi, bindings, c_str, device::Core, of, platform}; diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs index 2f30a398dddd..25fe97aafd02 100644 --- a/rust/kernel/prelude.rs +++ b/rust/kernel/prelude.rs @@ -31,9 +31,9 @@ pub use super::{build_assert, build_error}; // `super::std_vendor` is hidden, which makes the macro inline for some reason. #[doc(no_inline)] pub use super::dbg; -pub use super::fmt; pub use super::{dev_alert, dev_crit, dev_dbg, dev_emerg, dev_err, dev_info, dev_notice, dev_warn}; pub use super::{pr_alert, pr_crit, pr_debug, pr_emerg, pr_err, pr_info, pr_notice, pr_warn}; +pub use core::format_args as fmt; pub use super::{try_init, try_pin_init}; @@ -46,3 +46,5 @@ pub use super::{str::CStr, ThisModule}; pub use super::init::InPlaceInit; pub use super::current; + +pub use super::uaccess::UserPtr; diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs index 9783d960a97a..2d743d78d220 100644 --- a/rust/kernel/print.rs +++ b/rust/kernel/print.rs @@ -8,10 +8,10 @@ use crate::{ ffi::{c_char, c_void}, + fmt, prelude::*, str::RawFormatter, }; -use core::fmt; // Called from `vsprintf` with format specifier `%pA`. #[expect(clippy::missing_safety_doc)] @@ -25,7 +25,7 @@ unsafe extern "C" fn rust_fmt_argument( // SAFETY: The C contract guarantees that `buf` is valid if it's less than `end`. let mut w = unsafe { RawFormatter::from_ptrs(buf.cast(), end.cast()) }; // SAFETY: TODO. - let _ = w.write_fmt(unsafe { *(ptr as *const fmt::Arguments<'_>) }); + let _ = w.write_fmt(unsafe { *ptr.cast::<fmt::Arguments<'_>>() }); w.pos().cast() } @@ -109,7 +109,7 @@ pub unsafe fn call_printk( bindings::_printk( format_string.as_ptr(), module_name.as_ptr(), - &args as *const _ as *const c_void, + core::ptr::from_ref(&args).cast::<c_void>(), ); } } @@ -129,7 +129,7 @@ pub fn call_printk_cont(args: fmt::Arguments<'_>) { unsafe { bindings::_printk( format_strings::CONT.as_ptr(), - &args as *const _ as *const c_void, + core::ptr::from_ref(&args).cast::<c_void>(), ); } } @@ -149,7 +149,7 @@ macro_rules! print_macro ( // takes borrows on the arguments, but does not extend the scope of temporaries. // Therefore, a `match` expression is used to keep them around, since // the scrutinee is kept until the end of the `match`. - match format_args!($($arg)+) { + match $crate::prelude::fmt!($($arg)+) { // SAFETY: This hidden macro should only be called by the documented // printing macros which ensure the format string is one of the fixed // ones. All `__LOG_PREFIX`s are null-terminated as they are generated @@ -168,7 +168,7 @@ macro_rules! print_macro ( // The `CONT` case. 
($format_string:path, true, $($arg:tt)+) => ( $crate::print::call_printk_cont( - format_args!($($arg)+), + $crate::prelude::fmt!($($arg)+), ); ); ); diff --git a/rust/kernel/rbtree.rs b/rust/kernel/rbtree.rs index 8d978c896747..b8fe6be6fcc4 100644 --- a/rust/kernel/rbtree.rs +++ b/rust/kernel/rbtree.rs @@ -191,6 +191,12 @@ impl<K, V> RBTree<K, V> { } } + /// Returns true if this tree is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.root.rb_node.is_null() + } + /// Returns an iterator over the tree nodes, sorted by key. pub fn iter(&self) -> Iter<'_, K, V> { Iter { @@ -769,23 +775,14 @@ impl<'a, K, V> Cursor<'a, K, V> { // the tree cannot change. By the tree invariant, all nodes are valid. unsafe { bindings::rb_erase(&mut (*this).links, addr_of_mut!(self.tree.root)) }; - let current = match (prev, next) { - (_, Some(next)) => next, - (Some(prev), None) => prev, - (None, None) => { - return (None, node); - } - }; + // INVARIANT: + // - `current` is a valid node in the [`RBTree`] pointed to by `self.tree`. + let cursor = next.or(prev).map(|current| Self { + current, + tree: self.tree, + }); - ( - // INVARIANT: - // - `current` is a valid node in the [`RBTree`] pointed to by `self.tree`. - Some(Self { - current, - tree: self.tree, - }), - node, - ) + (cursor, node) } /// Remove the previous node, returning it if it exists. diff --git a/rust/kernel/revocable.rs b/rust/kernel/revocable.rs index 46768b374656..0f4ae673256d 100644 --- a/rust/kernel/revocable.rs +++ b/rust/kernel/revocable.rs @@ -233,6 +233,10 @@ impl<T> PinnedDrop for Revocable<T> { /// /// The RCU read-side lock is held while the guard is alive. pub struct RevocableGuard<'a, T> { + // This can't use the `&'a T` type because references that appear in function arguments must + // not become dangling during the execution of the function, which can happen if the + // `RevocableGuard` is passed as a function argument and then dropped during execution of the + // function. data_ref: *const T, _rcu_guard: rcu::Guard, _p: PhantomData<&'a ()>, diff --git a/rust/kernel/seq_file.rs b/rust/kernel/seq_file.rs index 7a9403eb6e5b..8f199b1a3bb1 100644 --- a/rust/kernel/seq_file.rs +++ b/rust/kernel/seq_file.rs @@ -37,7 +37,7 @@ impl SeqFile { bindings::seq_printf( self.inner.get(), c_str!("%pA").as_char_ptr(), - &args as *const _ as *const crate::ffi::c_void, + core::ptr::from_ref(&args).cast::<crate::ffi::c_void>(), ); } } diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index a927db8e079c..6c892550c0ba 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -3,7 +3,7 @@ //! String representations. use crate::alloc::{flags::*, AllocError, KVec}; -use core::fmt::{self, Write}; +use crate::fmt::{self, Write}; use core::ops::{self, Deref, DerefMut, Index}; use crate::prelude::*; @@ -29,7 +29,7 @@ impl BStr { #[inline] pub const fn from_bytes(bytes: &[u8]) -> &Self { // SAFETY: `BStr` is transparent to `[u8]`. - unsafe { &*(bytes as *const [u8] as *const BStr) } + unsafe { &*(core::ptr::from_ref(bytes) as *const BStr) } } /// Strip a prefix from `self`. Delegates to [`slice::strip_prefix`]. @@ -54,14 +54,14 @@ impl fmt::Display for BStr { /// Formats printable ASCII characters, escaping the rest. 
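A quick usage sketch for the `RBTree::is_empty` helper added above, which is just a null check on the root pointer (hypothetical snippet; assumes the existing `try_create_and_insert` API and `GFP_KERNEL` from the prelude)::

    use kernel::rbtree::RBTree;

    let mut tree = RBTree::<u32, u32>::new();
    assert!(tree.is_empty());

    // Inserting one node makes the root non-null.
    tree.try_create_and_insert(10, 100, GFP_KERNEL)?;
    assert!(!tree.is_empty());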
/// /// ``` - /// # use kernel::{fmt, b_str, str::{BStr, CString}}; + /// # use kernel::{prelude::fmt, b_str, str::{BStr, CString}}; /// let ascii = b_str!("Hello, BStr!"); - /// let s = CString::try_from_fmt(fmt!("{}", ascii))?; - /// assert_eq!(s.as_bytes(), "Hello, BStr!".as_bytes()); + /// let s = CString::try_from_fmt(fmt!("{ascii}"))?; + /// assert_eq!(s.to_bytes(), "Hello, BStr!".as_bytes()); /// /// let non_ascii = b_str!("🦀"); - /// let s = CString::try_from_fmt(fmt!("{}", non_ascii))?; - /// assert_eq!(s.as_bytes(), "\\xf0\\x9f\\xa6\\x80".as_bytes()); + /// let s = CString::try_from_fmt(fmt!("{non_ascii}"))?; + /// assert_eq!(s.to_bytes(), "\\xf0\\x9f\\xa6\\x80".as_bytes()); /// # Ok::<(), kernel::error::Error>(()) /// ``` fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -85,15 +85,15 @@ impl fmt::Debug for BStr { /// escaping the rest. /// /// ``` - /// # use kernel::{fmt, b_str, str::{BStr, CString}}; + /// # use kernel::{prelude::fmt, b_str, str::{BStr, CString}}; /// // Embedded double quotes are escaped. /// let ascii = b_str!("Hello, \"BStr\"!"); - /// let s = CString::try_from_fmt(fmt!("{:?}", ascii))?; - /// assert_eq!(s.as_bytes(), "\"Hello, \\\"BStr\\\"!\"".as_bytes()); + /// let s = CString::try_from_fmt(fmt!("{ascii:?}"))?; + /// assert_eq!(s.to_bytes(), "\"Hello, \\\"BStr\\\"!\"".as_bytes()); /// /// let non_ascii = b_str!("😺"); - /// let s = CString::try_from_fmt(fmt!("{:?}", non_ascii))?; - /// assert_eq!(s.as_bytes(), "\"\\xf0\\x9f\\x98\\xba\"".as_bytes()); + /// let s = CString::try_from_fmt(fmt!("{non_ascii:?}"))?; + /// assert_eq!(s.to_bytes(), "\"\\xf0\\x9f\\x98\\xba\"".as_bytes()); /// # Ok::<(), kernel::error::Error>(()) /// ``` fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -175,6 +175,15 @@ macro_rules! b_str { }}; } +/// Returns a C pointer to the string. +// It is a free function rather than a method on an extension trait because: +// +// - error[E0379]: functions in trait impls cannot be declared const +#[inline] +pub const fn as_char_ptr_in_const_context(c_str: &CStr) -> *const c_char { + c_str.0.as_ptr() +} + /// Possible errors when using conversion functions in [`CStr`]. #[derive(Debug, Clone, Copy)] pub enum CStrConvertError { @@ -232,12 +241,12 @@ impl CStr { /// last at least `'a`. When `CStr` is alive, the memory pointed by `ptr` /// must not be mutated. #[inline] - pub unsafe fn from_char_ptr<'a>(ptr: *const crate::ffi::c_char) -> &'a Self { + pub unsafe fn from_char_ptr<'a>(ptr: *const c_char) -> &'a Self { // SAFETY: The safety precondition guarantees `ptr` is a valid pointer // to a `NUL`-terminated C string. let len = unsafe { bindings::strlen(ptr) } + 1; // SAFETY: Lifetime guaranteed by the safety precondition. - let bytes = unsafe { core::slice::from_raw_parts(ptr as _, len) }; + let bytes = unsafe { core::slice::from_raw_parts(ptr.cast(), len) }; // SAFETY: As `len` is returned by `strlen`, `bytes` does not contain interior `NUL`. // As we have added 1 to `len`, the last byte is known to be `NUL`. unsafe { Self::from_bytes_with_nul_unchecked(bytes) } @@ -290,27 +299,49 @@ impl CStr { #[inline] pub unsafe fn from_bytes_with_nul_unchecked_mut(bytes: &mut [u8]) -> &mut CStr { // SAFETY: Properties of `bytes` guaranteed by the safety precondition. - unsafe { &mut *(bytes as *mut [u8] as *mut CStr) } + unsafe { &mut *(core::ptr::from_mut(bytes) as *mut CStr) } } /// Returns a C pointer to the string. 
+ /// + /// Using this function in a const context is deprecated in favor of + /// [`as_char_ptr_in_const_context`] in preparation for replacing `CStr` with `core::ffi::CStr` + /// which does not have this method. #[inline] - pub const fn as_char_ptr(&self) -> *const crate::ffi::c_char { - self.0.as_ptr() + pub const fn as_char_ptr(&self) -> *const c_char { + as_char_ptr_in_const_context(self) } /// Convert the string to a byte slice without the trailing `NUL` byte. #[inline] - pub fn as_bytes(&self) -> &[u8] { + pub fn to_bytes(&self) -> &[u8] { &self.0[..self.len()] } + /// Convert the string to a byte slice without the trailing `NUL` byte. + /// + /// This function is deprecated in favor of [`Self::to_bytes`] in preparation for replacing + /// `CStr` with `core::ffi::CStr` which does not have this method. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + self.to_bytes() + } + /// Convert the string to a byte slice containing the trailing `NUL` byte. #[inline] - pub const fn as_bytes_with_nul(&self) -> &[u8] { + pub const fn to_bytes_with_nul(&self) -> &[u8] { &self.0 } + /// Convert the string to a byte slice containing the trailing `NUL` byte. + /// + /// This function is deprecated in favor of [`Self::to_bytes_with_nul`] in preparation for + /// replacing `CStr` with `core::ffi::CStr` which does not have this method. + #[inline] + pub const fn as_bytes_with_nul(&self) -> &[u8] { + self.to_bytes_with_nul() + } + /// Yields a [`&str`] slice if the [`CStr`] contains valid UTF-8. /// /// If the contents of the [`CStr`] are valid UTF-8 data, this @@ -429,20 +460,20 @@ impl fmt::Display for CStr { /// /// ``` /// # use kernel::c_str; - /// # use kernel::fmt; + /// # use kernel::prelude::fmt; /// # use kernel::str::CStr; /// # use kernel::str::CString; /// let penguin = c_str!("🐧"); - /// let s = CString::try_from_fmt(fmt!("{}", penguin))?; - /// assert_eq!(s.as_bytes_with_nul(), "\\xf0\\x9f\\x90\\xa7\0".as_bytes()); + /// let s = CString::try_from_fmt(fmt!("{penguin}"))?; + /// assert_eq!(s.to_bytes_with_nul(), "\\xf0\\x9f\\x90\\xa7\0".as_bytes()); /// /// let ascii = c_str!("so \"cool\""); - /// let s = CString::try_from_fmt(fmt!("{}", ascii))?; - /// assert_eq!(s.as_bytes_with_nul(), "so \"cool\"\0".as_bytes()); + /// let s = CString::try_from_fmt(fmt!("{ascii}"))?; + /// assert_eq!(s.to_bytes_with_nul(), "so \"cool\"\0".as_bytes()); /// # Ok::<(), kernel::error::Error>(()) /// ``` fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for &c in self.as_bytes() { + for &c in self.to_bytes() { if (0x20..0x7f).contains(&c) { // Printable character. f.write_char(c as char)?; @@ -459,16 +490,16 @@ impl fmt::Debug for CStr { /// /// ``` /// # use kernel::c_str; - /// # use kernel::fmt; + /// # use kernel::prelude::fmt; /// # use kernel::str::CStr; /// # use kernel::str::CString; /// let penguin = c_str!("🐧"); - /// let s = CString::try_from_fmt(fmt!("{:?}", penguin))?; + /// let s = CString::try_from_fmt(fmt!("{penguin:?}"))?; /// assert_eq!(s.as_bytes_with_nul(), "\"\\xf0\\x9f\\x90\\xa7\"\0".as_bytes()); /// /// // Embedded double quotes are escaped. /// let ascii = c_str!("so \"cool\""); - /// let s = CString::try_from_fmt(fmt!("{:?}", ascii))?; + /// let s = CString::try_from_fmt(fmt!("{ascii:?}"))?; /// assert_eq!(s.as_bytes_with_nul(), "\"so \\\"cool\\\"\"\0".as_bytes()); /// # Ok::<(), kernel::error::Error>(()) /// ``` @@ -578,7 +609,7 @@ mod tests { macro_rules! format { ($($f:tt)*) => ({ - CString::try_from_fmt(::kernel::fmt!($($f)*))?.to_str()? 
+ CString::try_from_fmt(fmt!($($f)*))?.to_str()? }) } @@ -728,9 +759,9 @@ impl RawFormatter { pub(crate) unsafe fn from_ptrs(pos: *mut u8, end: *mut u8) -> Self { // INVARIANT: The safety requirements guarantee the type invariants. Self { - beg: pos as _, - pos: pos as _, - end: end as _, + beg: pos as usize, + pos: pos as usize, + end: end as usize, } } @@ -755,7 +786,7 @@ impl RawFormatter { /// /// N.B. It may point to invalid memory. pub(crate) fn pos(&self) -> *mut u8 { - self.pos as _ + self.pos as *mut u8 } /// Returns the number of bytes written to the formatter. @@ -840,14 +871,14 @@ impl fmt::Write for Formatter { /// # Examples /// /// ``` -/// use kernel::{str::CString, fmt}; +/// use kernel::{str::CString, prelude::fmt}; /// /// let s = CString::try_from_fmt(fmt!("{}{}{}", "abc", 10, 20))?; -/// assert_eq!(s.as_bytes_with_nul(), "abc1020\0".as_bytes()); +/// assert_eq!(s.to_bytes_with_nul(), "abc1020\0".as_bytes()); /// /// let tmp = "testing"; /// let s = CString::try_from_fmt(fmt!("{tmp}{}", 123))?; -/// assert_eq!(s.as_bytes_with_nul(), "testing123\0".as_bytes()); +/// assert_eq!(s.to_bytes_with_nul(), "testing123\0".as_bytes()); /// /// // This fails because it has an embedded `NUL` byte. /// let s = CString::try_from_fmt(fmt!("a\0b{}", 123)); @@ -917,7 +948,7 @@ impl<'a> TryFrom<&'a CStr> for CString { fn try_from(cstr: &'a CStr) -> Result<CString, AllocError> { let mut buf = KVec::new(); - buf.extend_from_slice(cstr.as_bytes_with_nul(), GFP_KERNEL)?; + buf.extend_from_slice(cstr.to_bytes_with_nul(), GFP_KERNEL)?; // INVARIANT: The `CStr` and `CString` types have the same invariants for // the string data, and we copied it over without changes. @@ -930,9 +961,3 @@ impl fmt::Debug for CString { fmt::Debug::fmt(&**self, f) } } - -/// A convenience alias for [`core::format_args`]. -#[macro_export] -macro_rules! fmt { - ($($f:tt)*) => ( ::core::format_args!($($f)*) ) -} diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index c23a12639924..00f9b558a3ad 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -10,6 +10,7 @@ use crate::types::Opaque; use pin_init; mod arc; +pub mod aref; pub mod completion; mod condvar; pub mod lock; @@ -41,7 +42,7 @@ impl LockClassKey { /// Initializes a dynamically allocated lock class key. In the common case of using a /// statically allocated lock class key, the static_lock_class! macro should be used instead. /// - /// # Example + /// # Examples /// ``` /// # use kernel::c_str; /// # use kernel::alloc::KBox; @@ -95,8 +96,11 @@ impl PinnedDrop for LockClassKey { macro_rules! static_lock_class { () => {{ static CLASS: $crate::sync::LockClassKey = - // SAFETY: lockdep expects uninitialized memory when it's handed a statically allocated - // lock_class_key + // Lockdep expects uninitialized memory when it's handed a statically allocated `struct + // lock_class_key`. + // + // SAFETY: `LockClassKey` transparently wraps `Opaque` which permits uninitialized + // memory. 
unsafe { ::core::mem::MaybeUninit::uninit().assume_init() }; $crate::prelude::Pin::static_ref(&CLASS) }}; diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index c7af0aa48a0a..63a66761d0c7 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -19,12 +19,14 @@ use crate::{ alloc::{AllocError, Flags, KBox}, bindings, + ffi::c_void, init::InPlaceInit, try_init, types::{ForeignOwnable, Opaque}, }; use core::{ alloc::Layout, + borrow::{Borrow, BorrowMut}, fmt, marker::PhantomData, mem::{ManuallyDrop, MaybeUninit}, @@ -140,10 +142,9 @@ pub struct Arc<T: ?Sized> { _p: PhantomData<ArcInner<T>>, } -#[doc(hidden)] #[pin_data] #[repr(C)] -pub struct ArcInner<T: ?Sized> { +struct ArcInner<T: ?Sized> { refcount: Opaque<bindings::refcount_t>, data: T, } @@ -372,20 +373,22 @@ impl<T: ?Sized> Arc<T> { } } -// SAFETY: The `into_foreign` function returns a pointer that is well-aligned. +// SAFETY: The pointer returned by `into_foreign` comes from a well aligned +// pointer to `ArcInner<T>`. unsafe impl<T: 'static> ForeignOwnable for Arc<T> { - type PointedTo = ArcInner<T>; + const FOREIGN_ALIGN: usize = core::mem::align_of::<ArcInner<T>>(); + type Borrowed<'a> = ArcBorrow<'a, T>; type BorrowedMut<'a> = Self::Borrowed<'a>; - fn into_foreign(self) -> *mut Self::PointedTo { - ManuallyDrop::new(self).ptr.as_ptr() + fn into_foreign(self) -> *mut c_void { + ManuallyDrop::new(self).ptr.as_ptr().cast() } - unsafe fn from_foreign(ptr: *mut Self::PointedTo) -> Self { + unsafe fn from_foreign(ptr: *mut c_void) -> Self { // SAFETY: The safety requirements of this function ensure that `ptr` comes from a previous // call to `Self::into_foreign`. - let inner = unsafe { NonNull::new_unchecked(ptr) }; + let inner = unsafe { NonNull::new_unchecked(ptr.cast::<ArcInner<T>>()) }; // SAFETY: By the safety requirement of this function, we know that `ptr` came from // a previous call to `Arc::into_foreign`, which guarantees that `ptr` is valid and @@ -393,20 +396,20 @@ unsafe impl<T: 'static> ForeignOwnable for Arc<T> { unsafe { Self::from_inner(inner) } } - unsafe fn borrow<'a>(ptr: *mut Self::PointedTo) -> ArcBorrow<'a, T> { + unsafe fn borrow<'a>(ptr: *mut c_void) -> ArcBorrow<'a, T> { // SAFETY: The safety requirements of this function ensure that `ptr` comes from a previous // call to `Self::into_foreign`. - let inner = unsafe { NonNull::new_unchecked(ptr) }; + let inner = unsafe { NonNull::new_unchecked(ptr.cast::<ArcInner<T>>()) }; // SAFETY: The safety requirements of `from_foreign` ensure that the object remains alive // for the lifetime of the returned value. unsafe { ArcBorrow::new(inner) } } - unsafe fn borrow_mut<'a>(ptr: *mut Self::PointedTo) -> ArcBorrow<'a, T> { + unsafe fn borrow_mut<'a>(ptr: *mut c_void) -> ArcBorrow<'a, T> { // SAFETY: The safety requirements for `borrow_mut` are a superset of the safety // requirements for `borrow`. - unsafe { Self::borrow(ptr) } + unsafe { <Self as ForeignOwnable>::borrow(ptr) } } } @@ -426,6 +429,31 @@ impl<T: ?Sized> AsRef<T> for Arc<T> { } } +/// # Examples +/// +/// ``` +/// # use core::borrow::Borrow; +/// # use kernel::sync::Arc; +/// struct Foo<B: Borrow<u32>>(B); +/// +/// // Owned instance. +/// let owned = Foo(1); +/// +/// // Shared instance. +/// let arc = Arc::new(1, GFP_KERNEL)?; +/// let shared = Foo(arc.clone()); +/// +/// let i = 1; +/// // Borrowed from `i`. 
+/// let borrowed = Foo(&i); +/// # Ok::<(), Error>(()) +/// ``` +impl<T: ?Sized> Borrow<T> for Arc<T> { + fn borrow(&self) -> &T { + self.deref() + } +} + impl<T: ?Sized> Clone for Arc<T> { fn clone(&self) -> Self { // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is @@ -834,6 +862,56 @@ impl<T: ?Sized> DerefMut for UniqueArc<T> { } } +/// # Examples +/// +/// ``` +/// # use core::borrow::Borrow; +/// # use kernel::sync::UniqueArc; +/// struct Foo<B: Borrow<u32>>(B); +/// +/// // Owned instance. +/// let owned = Foo(1); +/// +/// // Owned instance using `UniqueArc`. +/// let arc = UniqueArc::new(1, GFP_KERNEL)?; +/// let shared = Foo(arc); +/// +/// let i = 1; +/// // Borrowed from `i`. +/// let borrowed = Foo(&i); +/// # Ok::<(), Error>(()) +/// ``` +impl<T: ?Sized> Borrow<T> for UniqueArc<T> { + fn borrow(&self) -> &T { + self.deref() + } +} + +/// # Examples +/// +/// ``` +/// # use core::borrow::BorrowMut; +/// # use kernel::sync::UniqueArc; +/// struct Foo<B: BorrowMut<u32>>(B); +/// +/// // Owned instance. +/// let owned = Foo(1); +/// +/// // Owned instance using `UniqueArc`. +/// let arc = UniqueArc::new(1, GFP_KERNEL)?; +/// let shared = Foo(arc); +/// +/// let mut i = 1; +/// // Borrowed from `i`. +/// let borrowed = Foo(&mut i); +/// # Ok::<(), Error>(()) +/// ``` +impl<T: ?Sized> BorrowMut<T> for UniqueArc<T> { + fn borrow_mut(&mut self) -> &mut T { + self.deref_mut() + } +} + impl<T: fmt::Display + ?Sized> fmt::Display for UniqueArc<T> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self.deref(), f) diff --git a/rust/kernel/sync/aref.rs b/rust/kernel/sync/aref.rs new file mode 100644 index 000000000000..dbd77bb68617 --- /dev/null +++ b/rust/kernel/sync/aref.rs @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Internal reference counting support. + +use core::{marker::PhantomData, mem::ManuallyDrop, ops::Deref, ptr::NonNull}; + +/// Types that are _always_ reference counted. +/// +/// It allows such types to define their own custom ref increment and decrement functions. +/// Additionally, it allows users to convert from a shared reference `&T` to an owned reference +/// [`ARef<T>`]. +/// +/// This is usually implemented by wrappers to existing structures on the C side of the code. For +/// Rust code, the recommendation is to use [`Arc`](crate::sync::Arc) to create reference-counted +/// instances of a type. +/// +/// # Safety +/// +/// Implementers must ensure that increments to the reference count keep the object alive in memory +/// at least until matching decrements are performed. +/// +/// Implementers must also ensure that all instances are reference-counted. (Otherwise they +/// won't be able to honour the requirement that [`AlwaysRefCounted::inc_ref`] keep the object +/// alive.) +pub unsafe trait AlwaysRefCounted { + /// Increments the reference count on the object. + fn inc_ref(&self); + + /// Decrements the reference count on the object. + /// + /// Frees the object when the count reaches zero. + /// + /// # Safety + /// + /// Callers must ensure that there was a previous matching increment to the reference count, + /// and that the object is no longer used after its reference count is decremented (as it may + /// result in the object being freed), unless the caller owns another increment on the refcount + /// (e.g., it calls [`AlwaysRefCounted::inc_ref`] twice, then calls + /// [`AlwaysRefCounted::dec_ref`] once). 
+ unsafe fn dec_ref(obj: NonNull<Self>); +} + +/// An owned reference to an always-reference-counted object. +/// +/// The object's reference count is automatically decremented when an instance of [`ARef`] is +/// dropped. It is also automatically incremented when a new instance is created via +/// [`ARef::clone`]. +/// +/// # Invariants +/// +/// The pointer stored in `ptr` is non-null and valid for the lifetime of the [`ARef`] instance. In +/// particular, the [`ARef`] instance owns an increment on the underlying object's reference count. +pub struct ARef<T: AlwaysRefCounted> { + ptr: NonNull<T>, + _p: PhantomData<T>, +} + +// SAFETY: It is safe to send `ARef<T>` to another thread when the underlying `T` is `Sync` because +// it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs +// `T` to be `Send` because any thread that has an `ARef<T>` may ultimately access `T` using a +// mutable reference, for example, when the reference count reaches zero and `T` is dropped. +unsafe impl<T: AlwaysRefCounted + Sync + Send> Send for ARef<T> {} + +// SAFETY: It is safe to send `&ARef<T>` to another thread when the underlying `T` is `Sync` +// because it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, +// it needs `T` to be `Send` because any thread that has a `&ARef<T>` may clone it and get an +// `ARef<T>` on that thread, so the thread may ultimately access `T` using a mutable reference, for +// example, when the reference count reaches zero and `T` is dropped. +unsafe impl<T: AlwaysRefCounted + Sync + Send> Sync for ARef<T> {} + +impl<T: AlwaysRefCounted> ARef<T> { + /// Creates a new instance of [`ARef`]. + /// + /// It takes over an increment of the reference count on the underlying object. + /// + /// # Safety + /// + /// Callers must ensure that the reference count was incremented at least once, and that they + /// are properly relinquishing one increment. That is, if there is only one increment, callers + /// must not use the underlying object anymore -- it is only safe to do so via the newly + /// created [`ARef`]. + pub unsafe fn from_raw(ptr: NonNull<T>) -> Self { + // INVARIANT: The safety requirements guarantee that the new instance now owns the + // increment on the refcount. + Self { + ptr, + _p: PhantomData, + } + } + + /// Consumes the `ARef`, returning a raw pointer. + /// + /// This function does not change the refcount. After calling this function, the caller is + /// responsible for the refcount previously managed by the `ARef`. + /// + /// # Examples + /// + /// ``` + /// use core::ptr::NonNull; + /// use kernel::types::{ARef, AlwaysRefCounted}; + /// + /// struct Empty {} + /// + /// # // SAFETY: TODO. + /// unsafe impl AlwaysRefCounted for Empty { + /// fn inc_ref(&self) {} + /// unsafe fn dec_ref(_obj: NonNull<Self>) {} + /// } + /// + /// let mut data = Empty {}; + /// let ptr = NonNull::<Empty>::new(&mut data).unwrap(); + /// # // SAFETY: TODO. + /// let data_ref: ARef<Empty> = unsafe { ARef::from_raw(ptr) }; + /// let raw_ptr: NonNull<Empty> = ARef::into_raw(data_ref); + /// + /// assert_eq!(ptr, raw_ptr); + /// ``` + pub fn into_raw(me: Self) -> NonNull<T> { + ManuallyDrop::new(me).ptr + } +} + +impl<T: AlwaysRefCounted> Clone for ARef<T> { + fn clone(&self) -> Self { + self.inc_ref(); + // SAFETY: We just incremented the refcount above. 
+        unsafe { Self::from_raw(self.ptr) }
+    }
+}
+
+impl<T: AlwaysRefCounted> Deref for ARef<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: The type invariants guarantee that the object is valid.
+        unsafe { self.ptr.as_ref() }
+    }
+}
+
+impl<T: AlwaysRefCounted> From<&T> for ARef<T> {
+    fn from(b: &T) -> Self {
+        b.inc_ref();
+        // SAFETY: We just incremented the refcount above.
+        unsafe { Self::from_raw(NonNull::from(b)) }
+    }
+}
+
+impl<T: AlwaysRefCounted> Drop for ARef<T> {
+    fn drop(&mut self) {
+        // SAFETY: The type invariants guarantee that the `ARef` owns the reference we're about to
+        // decrement.
+        unsafe { T::dec_ref(self.ptr) };
+    }
+}
diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs
index a8089a98da9e..64c8dcf548d6 100644
--- a/rust/kernel/time.rs
+++ b/rust/kernel/time.rs
@@ -24,6 +24,9 @@
 //! C header: [`include/linux/jiffies.h`](srctree/include/linux/jiffies.h).
 //! C header: [`include/linux/ktime.h`](srctree/include/linux/ktime.h).
 
+use core::marker::PhantomData;
+
+pub mod delay;
 pub mod hrtimer;
 
 /// The number of nanoseconds per microsecond.
@@ -49,26 +52,141 @@ pub fn msecs_to_jiffies(msecs: Msecs) -> Jiffies {
     unsafe { bindings::__msecs_to_jiffies(msecs) }
 }
 
+/// Trait for clock sources.
+///
+/// Selection of the clock source depends on the use case. In some cases the usage of a
+/// particular clock is mandatory, e.g. in network protocols, filesystems. In other
+/// cases the user of the clock has to decide which clock is best suited for the
+/// purpose. In most scenarios clock [`Monotonic`] is the best choice as it
+/// provides an accurate monotonic notion of time (leap second smearing ignored).
+pub trait ClockSource {
+    /// The kernel clock ID associated with this clock source.
+    ///
+    /// This constant corresponds to the C side `clockid_t` value.
+    const ID: bindings::clockid_t;
+
+    /// Get the current time from the clock source.
+    ///
+    /// The function must return a value in the range from 0 to `KTIME_MAX`.
+    fn ktime_get() -> bindings::ktime_t;
+}
+
+/// A monotonically increasing clock.
+///
+/// A nonsettable system-wide clock that represents monotonic time since, as
+/// described by POSIX, "some unspecified point in the past". On Linux, that
+/// point corresponds to the number of seconds that the system has been
+/// running since it was booted.
+///
+/// The CLOCK_MONOTONIC clock is not affected by discontinuous jumps in the
+/// CLOCK_REALTIME clock (e.g., if the system administrator manually changes
+/// the clock), but is affected by frequency adjustments. This clock does not
+/// count time that the system is suspended.
+pub struct Monotonic;
+
+impl ClockSource for Monotonic {
+    const ID: bindings::clockid_t = bindings::CLOCK_MONOTONIC as bindings::clockid_t;
+
+    fn ktime_get() -> bindings::ktime_t {
+        // SAFETY: It is always safe to call `ktime_get()` outside of NMI context.
+        unsafe { bindings::ktime_get() }
+    }
+}
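With `ClockSource` implemented by marker types such as `Monotonic`, the clock becomes part of `Instant`'s type (see the hunks below), so instants taken from different clocks can no longer be mixed up. A usage sketch, also exercising the `delay::fsleep` helper declared above and added later in this series::

    use kernel::time::{delay::fsleep, Delta, Instant, Monotonic};

    let t0 = Instant::<Monotonic>::now();
    fsleep(Delta::from_millis(10)); // sleeps for at least 10 ms
    let elapsed: Delta = t0.elapsed();
    assert!(elapsed.as_millis() >= 10);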
+
+/// A settable system-wide clock that measures real (i.e., wall-clock) time.
+///
+/// Setting this clock requires appropriate privileges. This clock is
+/// affected by discontinuous jumps in the system time (e.g., if the system
+/// administrator manually changes the clock), and by frequency adjustments
+/// performed by NTP and similar applications via adjtime(3), adjtimex(2),
+/// clock_adjtime(2), and ntp_adjtime(3). This clock normally counts the
+/// number of seconds since 1970-01-01 00:00:00 Coordinated Universal Time
+/// (UTC) except that it ignores leap seconds; near a leap second it may be
+/// adjusted by leap second smearing to stay roughly in sync with UTC. Leap
+/// second smearing applies frequency adjustments to the clock to speed up
+/// or slow down the clock to account for the leap second without
+/// discontinuities in the clock. If leap second smearing is not applied,
+/// the clock will experience discontinuity around leap second adjustment.
+pub struct RealTime;
+
+impl ClockSource for RealTime {
+    const ID: bindings::clockid_t = bindings::CLOCK_REALTIME as bindings::clockid_t;
+
+    fn ktime_get() -> bindings::ktime_t {
+        // SAFETY: It is always safe to call `ktime_get_real()` outside of NMI context.
+        unsafe { bindings::ktime_get_real() }
+    }
+}
+
+/// A monotonic clock that ticks while the system is suspended.
+///
+/// A nonsettable system-wide clock that is identical to CLOCK_MONOTONIC,
+/// except that it also includes any time that the system is suspended. This
+/// allows applications to get a suspend-aware monotonic clock without
+/// having to deal with the complications of CLOCK_REALTIME, which may have
+/// discontinuities if the time is changed using settimeofday(2) or similar.
+pub struct BootTime;
+
+impl ClockSource for BootTime {
+    const ID: bindings::clockid_t = bindings::CLOCK_BOOTTIME as bindings::clockid_t;
+
+    fn ktime_get() -> bindings::ktime_t {
+        // SAFETY: It is always safe to call `ktime_get_boottime()` outside of NMI context.
+        unsafe { bindings::ktime_get_boottime() }
+    }
+}
+
+/// International Atomic Time.
+///
+/// A system-wide clock derived from wall-clock time but counting leap seconds.
+///
+/// This clock is coupled to CLOCK_REALTIME and will be set when CLOCK_REALTIME is
+/// set, or when the offset to CLOCK_REALTIME is changed via adjtimex(2). This
+/// usually happens during boot and **should** not happen during normal operations.
+/// However, if NTP or another application adjusts CLOCK_REALTIME by leap second
+/// smearing, this clock will not be precise during leap second smearing.
+///
+/// The acronym TAI refers to International Atomic Time.
+pub struct Tai;
+
+impl ClockSource for Tai {
+    const ID: bindings::clockid_t = bindings::CLOCK_TAI as bindings::clockid_t;
+
+    fn ktime_get() -> bindings::ktime_t {
+        // SAFETY: It is always safe to call `ktime_get_clocktai()` outside of NMI context.
+        unsafe { bindings::ktime_get_clocktai() }
+    }
+}
+
 /// A specific point in time.
 ///
 /// # Invariants
 ///
 /// The `inner` value is in the range from 0 to `KTIME_MAX`.
 #[repr(transparent)]
-#[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord)]
-pub struct Instant {
+#[derive(PartialEq, PartialOrd, Eq, Ord)]
+pub struct Instant<C: ClockSource> {
     inner: bindings::ktime_t,
+    _c: PhantomData<C>,
+}
+
+impl<C: ClockSource> Clone for Instant<C> {
+    fn clone(&self) -> Self {
+        *self
+    }
 }
 
-impl Instant {
-    /// Get the current time using `CLOCK_MONOTONIC`.
+impl<C: ClockSource> Copy for Instant<C> {}
+
+impl<C: ClockSource> Instant<C> {
+    /// Get the current time from the clock source.
     #[inline]
     pub fn now() -> Self {
-        // INVARIANT: The `ktime_get()` function returns a value in the range
+        // INVARIANT: The `ClockSource::ktime_get()` function returns a value in the range
         // from 0 to `KTIME_MAX`.
         Self {
-            // SAFETY: It is always safe to call `ktime_get()` outside of NMI context.
- inner: unsafe { bindings::ktime_get() }, + inner: C::ktime_get(), + _c: PhantomData, } } @@ -77,86 +195,25 @@ impl Instant { pub fn elapsed(&self) -> Delta { Self::now() - *self } + + #[inline] + pub(crate) fn as_nanos(&self) -> i64 { + self.inner + } } -impl core::ops::Sub for Instant { +impl<C: ClockSource> core::ops::Sub for Instant<C> { type Output = Delta; // By the type invariant, it never overflows. #[inline] - fn sub(self, other: Instant) -> Delta { + fn sub(self, other: Instant<C>) -> Delta { Delta { nanos: self.inner - other.inner, } } } -/// An identifier for a clock. Used when specifying clock sources. -/// -/// -/// Selection of the clock depends on the use case. In some cases the usage of a -/// particular clock is mandatory, e.g. in network protocols, filesystems.In other -/// cases the user of the clock has to decide which clock is best suited for the -/// purpose. In most scenarios clock [`ClockId::Monotonic`] is the best choice as it -/// provides a accurate monotonic notion of time (leap second smearing ignored). -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -#[repr(u32)] -pub enum ClockId { - /// A settable system-wide clock that measures real (i.e., wall-clock) time. - /// - /// Setting this clock requires appropriate privileges. This clock is - /// affected by discontinuous jumps in the system time (e.g., if the system - /// administrator manually changes the clock), and by frequency adjustments - /// performed by NTP and similar applications via adjtime(3), adjtimex(2), - /// clock_adjtime(2), and ntp_adjtime(3). This clock normally counts the - /// number of seconds since 1970-01-01 00:00:00 Coordinated Universal Time - /// (UTC) except that it ignores leap seconds; near a leap second it may be - /// adjusted by leap second smearing to stay roughly in sync with UTC. Leap - /// second smearing applies frequency adjustments to the clock to speed up - /// or slow down the clock to account for the leap second without - /// discontinuities in the clock. If leap second smearing is not applied, - /// the clock will experience discontinuity around leap second adjustment. - RealTime = bindings::CLOCK_REALTIME, - /// A monotonically increasing clock. - /// - /// A nonsettable system-wide clock that represents monotonic time since—as - /// described by POSIX—"some unspecified point in the past". On Linux, that - /// point corresponds to the number of seconds that the system has been - /// running since it was booted. - /// - /// The CLOCK_MONOTONIC clock is not affected by discontinuous jumps in the - /// CLOCK_REAL (e.g., if the system administrator manually changes the - /// clock), but is affected by frequency adjustments. This clock does not - /// count time that the system is suspended. - Monotonic = bindings::CLOCK_MONOTONIC, - /// A monotonic that ticks while system is suspended. - /// - /// A nonsettable system-wide clock that is identical to CLOCK_MONOTONIC, - /// except that it also includes any time that the system is suspended. This - /// allows applications to get a suspend-aware monotonic clock without - /// having to deal with the complications of CLOCK_REALTIME, which may have - /// discontinuities if the time is changed using settimeofday(2) or similar. - BootTime = bindings::CLOCK_BOOTTIME, - /// International Atomic Time. - /// - /// A system-wide clock derived from wall-clock time but counting leap seconds. 
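Note the contrast with the `ClockId` enum being removed here: a new clock is now added by implementing `ClockSource` rather than by extending the enum and its `into_c` mapping. A hypothetical sketch, assuming bindgen exposed `CLOCK_MONOTONIC_RAW` and `ktime_get_raw()` (neither is part of this series)::

    use kernel::{bindings, time::ClockSource};

    /// Hypothetical raw-monotonic clock, mirroring the shape of `Monotonic`.
    pub struct MonotonicRaw;

    impl ClockSource for MonotonicRaw {
        const ID: bindings::clockid_t = bindings::CLOCK_MONOTONIC_RAW as bindings::clockid_t;

        fn ktime_get() -> bindings::ktime_t {
            // SAFETY: It is always safe to call `ktime_get_raw()` outside of NMI context.
            unsafe { bindings::ktime_get_raw() }
        }
    }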
- /// - /// This clock is coupled to CLOCK_REALTIME and will be set when CLOCK_REALTIME is - /// set, or when the offset to CLOCK_REALTIME is changed via adjtimex(2). This - /// usually happens during boot and **should** not happen during normal operations. - /// However, if NTP or another application adjusts CLOCK_REALTIME by leap second - /// smearing, this clock will not be precise during leap second smearing. - /// - /// The acronym TAI refers to International Atomic Time. - TAI = bindings::CLOCK_TAI, -} - -impl ClockId { - fn into_c(self) -> bindings::clockid_t { - self as bindings::clockid_t - } -} - /// A span of time. /// /// This struct represents a span of time, with its value stored as nanoseconds. @@ -228,13 +285,31 @@ impl Delta { /// Return the smallest number of microseconds greater than or equal /// to the value in the [`Delta`]. #[inline] - pub const fn as_micros_ceil(self) -> i64 { - self.as_nanos().saturating_add(NSEC_PER_USEC - 1) / NSEC_PER_USEC + pub fn as_micros_ceil(self) -> i64 { + #[cfg(CONFIG_64BIT)] + { + self.as_nanos().saturating_add(NSEC_PER_USEC - 1) / NSEC_PER_USEC + } + + #[cfg(not(CONFIG_64BIT))] + // SAFETY: It is always safe to call `ktime_to_us()` with any value. + unsafe { + bindings::ktime_to_us(self.as_nanos().saturating_add(NSEC_PER_USEC - 1)) + } } /// Return the number of milliseconds in the [`Delta`]. #[inline] - pub const fn as_millis(self) -> i64 { - self.as_nanos() / NSEC_PER_MSEC + pub fn as_millis(self) -> i64 { + #[cfg(CONFIG_64BIT)] + { + self.as_nanos() / NSEC_PER_MSEC + } + + #[cfg(not(CONFIG_64BIT))] + // SAFETY: It is always safe to call `ktime_to_ms()` with any value. + unsafe { + bindings::ktime_to_ms(self.as_nanos()) + } } } diff --git a/rust/kernel/time/delay.rs b/rust/kernel/time/delay.rs new file mode 100644 index 000000000000..eb8838da62bc --- /dev/null +++ b/rust/kernel/time/delay.rs @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Delay and sleep primitives. +//! +//! This module contains the kernel APIs related to delay and sleep that +//! have been ported or wrapped for usage by Rust code in the kernel. +//! +//! C header: [`include/linux/delay.h`](srctree/include/linux/delay.h). + +use super::Delta; +use crate::prelude::*; + +/// Sleeps for a given duration at least. +/// +/// Equivalent to the C side [`fsleep()`], flexible sleep function, +/// which automatically chooses the best sleep method based on a duration. +/// +/// `delta` must be within `[0, i32::MAX]` microseconds; +/// otherwise, it is erroneous behavior. That is, it is considered a bug +/// to call this function with an out-of-range value, in which case the function +/// will sleep for at least the maximum value in the range and may warn +/// in the future. +/// +/// The behavior above differs from the C side [`fsleep()`] for which out-of-range +/// values mean "infinite timeout" instead. +/// +/// This function can only be used in a nonatomic context. +/// +/// [`fsleep()`]: https://docs.kernel.org/timers/delay_sleep_functions.html#c.fsleep +pub fn fsleep(delta: Delta) { + // The maximum value is set to `i32::MAX` microseconds to prevent integer + // overflow inside fsleep, which could lead to unintentional infinite sleep. + const MAX_DELTA: Delta = Delta::from_micros(i32::MAX as i64); + + let delta = if (Delta::ZERO..=MAX_DELTA).contains(&delta) { + delta + } else { + // TODO: Add WARN_ONCE() when it's supported. + MAX_DELTA + }; + + // SAFETY: It is always safe to call `fsleep()` with any duration. 
+    unsafe {
+        // Convert the duration to microseconds and round up to preserve
+        // the guarantee; `fsleep()` sleeps for at least the provided duration,
+        // but it may sleep for longer under some circumstances.
+        bindings::fsleep(delta.as_micros_ceil() as c_ulong)
+    }
+}
diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs
index 36e1290cd079..144e3b57cc78 100644
--- a/rust/kernel/time/hrtimer.rs
+++ b/rust/kernel/time/hrtimer.rs
@@ -67,27 +67,11 @@
 //! A `restart` operation on a timer in the **stopped** state is equivalent to a
 //! `start` operation.
 
-use super::ClockId;
+use super::{ClockSource, Delta, Instant};
 use crate::{prelude::*, types::Opaque};
 use core::marker::PhantomData;
 use pin_init::PinInit;
 
-/// A Rust wrapper around a `ktime_t`.
-// NOTE: Ktime is going to be removed when hrtimer is converted to Instant/Delta.
-#[repr(transparent)]
-#[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord)]
-pub struct Ktime {
-    inner: bindings::ktime_t,
-}
-
-impl Ktime {
-    /// Returns the number of nanoseconds.
-    #[inline]
-    pub fn to_ns(self) -> i64 {
-        self.inner
-    }
-}
-
 /// A timer backed by a C `struct hrtimer`.
 ///
 /// # Invariants
@@ -98,7 +82,6 @@ impl Ktime {
 pub struct HrTimer<T> {
     #[pin]
     timer: Opaque<bindings::hrtimer>,
-    mode: HrTimerMode,
     _t: PhantomData<T>,
 }
 
@@ -112,9 +95,10 @@ unsafe impl<T> Sync for HrTimer<T> {}
 
 impl<T> HrTimer<T> {
     /// Return an initializer for a new timer instance.
-    pub fn new(mode: HrTimerMode, clock: ClockId) -> impl PinInit<Self>
+    pub fn new() -> impl PinInit<Self>
     where
         T: HrTimerCallback,
+        T: HasHrTimer<T>,
     {
         pin_init!(Self {
             // INVARIANT: We initialize `timer` with `hrtimer_setup` below.
@@ -126,12 +110,11 @@ impl<T> HrTimer<T> {
                     bindings::hrtimer_setup(
                         place,
                         Some(T::Pointer::run),
-                        clock.into_c(),
-                        mode.into_c(),
+                        <<T as HasHrTimer<T>>::TimerMode as HrTimerMode>::Clock::ID,
+                        <T as HasHrTimer<T>>::TimerMode::C_MODE,
                     );
                 }
             }),
-            mode: mode,
             _t: PhantomData,
         })
     }
@@ -148,7 +131,7 @@ impl<T> HrTimer<T> {
         // SAFETY: The field projection to `timer` does not go out of bounds,
         // because the caller of this function promises that `this` points to an
         // allocation of at least the size of `Self`.
-        unsafe { Opaque::raw_get(core::ptr::addr_of!((*this).timer)) }
+        unsafe { Opaque::cast_into(core::ptr::addr_of!((*this).timer)) }
     }
 
     /// Cancel an initialized and potentially running timer.
@@ -193,6 +176,11 @@ impl<T> HrTimer<T> {
 /// exist. A timer can be manipulated through any of the handles, and a handle
 /// may represent a cancelled timer.
 pub trait HrTimerPointer: Sync + Sized {
+    /// The operational mode associated with this timer.
+    ///
+    /// This defines how the expiration value is interpreted.
+    type TimerMode: HrTimerMode;
+
     /// A handle representing a started or restarted timer.
     ///
     /// If the timer is running or if the timer callback is executing when the
@@ -205,7 +193,7 @@ pub trait HrTimerPointer: Sync + Sized {
 
     /// Start the timer with expiry after `expires` time units. If the timer was
     /// already running, it is restarted with the new expiry time.
-    fn start(self, expires: Ktime) -> Self::TimerHandle;
+    fn start(self, expires: <Self::TimerMode as HrTimerMode>::Expires) -> Self::TimerHandle;
 }
 
 /// Unsafe version of [`HrTimerPointer`] for situations where leaking the
@@ -220,6 +208,11 @@ pub trait HrTimerPointer: Sync + Sized {
 /// [`UnsafeHrTimerPointer`] outlives any associated [`HrTimerPointer::TimerHandle`]
 /// instances.
pub unsafe trait UnsafeHrTimerPointer: Sync + Sized { + /// The operational mode associated with this timer. + /// + /// This defines how the expiration value is interpreted. + type TimerMode: HrTimerMode; + /// A handle representing a running timer. /// /// # Safety @@ -236,7 +229,7 @@ pub unsafe trait UnsafeHrTimerPointer: Sync + Sized { /// /// Caller promises keep the timer structure alive until the timer is dead. /// Caller can ensure this by not leaking the returned [`Self::TimerHandle`]. - unsafe fn start(self, expires: Ktime) -> Self::TimerHandle; + unsafe fn start(self, expires: <Self::TimerMode as HrTimerMode>::Expires) -> Self::TimerHandle; } /// A trait for stack allocated timers. @@ -246,9 +239,14 @@ pub unsafe trait UnsafeHrTimerPointer: Sync + Sized { /// Implementers must ensure that `start_scoped` does not return until the /// timer is dead and the timer handler is not running. pub unsafe trait ScopedHrTimerPointer { + /// The operational mode associated with this timer. + /// + /// This defines how the expiration value is interpreted. + type TimerMode: HrTimerMode; + /// Start the timer to run after `expires` time units and immediately /// after call `f`. When `f` returns, the timer is cancelled. - fn start_scoped<T, F>(self, expires: Ktime, f: F) -> T + fn start_scoped<T, F>(self, expires: <Self::TimerMode as HrTimerMode>::Expires, f: F) -> T where F: FnOnce() -> T; } @@ -260,7 +258,13 @@ unsafe impl<T> ScopedHrTimerPointer for T where T: UnsafeHrTimerPointer, { - fn start_scoped<U, F>(self, expires: Ktime, f: F) -> U + type TimerMode = T::TimerMode; + + fn start_scoped<U, F>( + self, + expires: <<T as UnsafeHrTimerPointer>::TimerMode as HrTimerMode>::Expires, + f: F, + ) -> U where F: FnOnce() -> U, { @@ -335,6 +339,11 @@ pub unsafe trait HrTimerHandle { /// their documentation. All the methods of this trait must operate on the same /// field. pub unsafe trait HasHrTimer<T> { + /// The operational mode associated with this timer. + /// + /// This defines how the expiration value is interpreted. + type TimerMode: HrTimerMode; + /// Return a pointer to the [`HrTimer`] within `Self`. /// /// This function is useful to get access to the value without creating @@ -382,14 +391,14 @@ pub unsafe trait HasHrTimer<T> { /// - `this` must point to a valid `Self`. /// - Caller must ensure that the pointee of `this` lives until the timer /// fires or is canceled. - unsafe fn start(this: *const Self, expires: Ktime) { + unsafe fn start(this: *const Self, expires: <Self::TimerMode as HrTimerMode>::Expires) { // SAFETY: By function safety requirement, `this` is a valid `Self`. unsafe { bindings::hrtimer_start_range_ns( Self::c_timer_ptr(this).cast_mut(), - expires.to_ns(), + expires.as_nanos(), 0, - (*Self::raw_get_timer(this)).mode.into_c(), + <Self::TimerMode as HrTimerMode>::C_MODE, ); } } @@ -411,80 +420,171 @@ impl HrTimerRestart { } } -/// Operational mode of [`HrTimer`]. -// NOTE: Some of these have the same encoding on the C side, so we keep -// `repr(Rust)` and convert elsewhere. -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum HrTimerMode { - /// Timer expires at the given expiration time. - Absolute, - /// Timer expires after the given expiration time interpreted as a duration from now. - Relative, - /// Timer does not move between CPU cores. - Pinned, - /// Timer handler is executed in soft irq context. - Soft, - /// Timer handler is executed in hard irq context. - Hard, - /// Timer expires at the given expiration time. - /// Timer does not move between CPU cores. 
- AbsolutePinned, - /// Timer expires after the given expiration time interpreted as a duration from now. - /// Timer does not move between CPU cores. - RelativePinned, - /// Timer expires at the given expiration time. - /// Timer handler is executed in soft irq context. - AbsoluteSoft, - /// Timer expires after the given expiration time interpreted as a duration from now. - /// Timer handler is executed in soft irq context. - RelativeSoft, - /// Timer expires at the given expiration time. - /// Timer does not move between CPU cores. - /// Timer handler is executed in soft irq context. - AbsolutePinnedSoft, - /// Timer expires after the given expiration time interpreted as a duration from now. - /// Timer does not move between CPU cores. - /// Timer handler is executed in soft irq context. - RelativePinnedSoft, - /// Timer expires at the given expiration time. - /// Timer handler is executed in hard irq context. - AbsoluteHard, - /// Timer expires after the given expiration time interpreted as a duration from now. - /// Timer handler is executed in hard irq context. - RelativeHard, - /// Timer expires at the given expiration time. - /// Timer does not move between CPU cores. - /// Timer handler is executed in hard irq context. - AbsolutePinnedHard, - /// Timer expires after the given expiration time interpreted as a duration from now. - /// Timer does not move between CPU cores. - /// Timer handler is executed in hard irq context. - RelativePinnedHard, +/// Time representations that can be used as expiration values in [`HrTimer`]. +pub trait HrTimerExpires { + /// Converts the expiration time into a nanosecond representation. + /// + /// This value corresponds to a raw ktime_t value, suitable for passing to kernel + /// timer functions. The interpretation (absolute vs relative) depends on the + /// associated [HrTimerMode] in use. 
+ fn as_nanos(&self) -> i64; } -impl HrTimerMode { - fn into_c(self) -> bindings::hrtimer_mode { - use bindings::*; - match self { - HrTimerMode::Absolute => hrtimer_mode_HRTIMER_MODE_ABS, - HrTimerMode::Relative => hrtimer_mode_HRTIMER_MODE_REL, - HrTimerMode::Pinned => hrtimer_mode_HRTIMER_MODE_PINNED, - HrTimerMode::Soft => hrtimer_mode_HRTIMER_MODE_SOFT, - HrTimerMode::Hard => hrtimer_mode_HRTIMER_MODE_HARD, - HrTimerMode::AbsolutePinned => hrtimer_mode_HRTIMER_MODE_ABS_PINNED, - HrTimerMode::RelativePinned => hrtimer_mode_HRTIMER_MODE_REL_PINNED, - HrTimerMode::AbsoluteSoft => hrtimer_mode_HRTIMER_MODE_ABS_SOFT, - HrTimerMode::RelativeSoft => hrtimer_mode_HRTIMER_MODE_REL_SOFT, - HrTimerMode::AbsolutePinnedSoft => hrtimer_mode_HRTIMER_MODE_ABS_PINNED_SOFT, - HrTimerMode::RelativePinnedSoft => hrtimer_mode_HRTIMER_MODE_REL_PINNED_SOFT, - HrTimerMode::AbsoluteHard => hrtimer_mode_HRTIMER_MODE_ABS_HARD, - HrTimerMode::RelativeHard => hrtimer_mode_HRTIMER_MODE_REL_HARD, - HrTimerMode::AbsolutePinnedHard => hrtimer_mode_HRTIMER_MODE_ABS_PINNED_HARD, - HrTimerMode::RelativePinnedHard => hrtimer_mode_HRTIMER_MODE_REL_PINNED_HARD, - } +impl<C: ClockSource> HrTimerExpires for Instant<C> { + #[inline] + fn as_nanos(&self) -> i64 { + Instant::<C>::as_nanos(self) + } +} + +impl HrTimerExpires for Delta { + #[inline] + fn as_nanos(&self) -> i64 { + Delta::as_nanos(*self) } } +mod private { + use crate::time::ClockSource; + + pub trait Sealed {} + + impl<C: ClockSource> Sealed for super::AbsoluteMode<C> {} + impl<C: ClockSource> Sealed for super::RelativeMode<C> {} + impl<C: ClockSource> Sealed for super::AbsolutePinnedMode<C> {} + impl<C: ClockSource> Sealed for super::RelativePinnedMode<C> {} + impl<C: ClockSource> Sealed for super::AbsoluteSoftMode<C> {} + impl<C: ClockSource> Sealed for super::RelativeSoftMode<C> {} + impl<C: ClockSource> Sealed for super::AbsolutePinnedSoftMode<C> {} + impl<C: ClockSource> Sealed for super::RelativePinnedSoftMode<C> {} + impl<C: ClockSource> Sealed for super::AbsoluteHardMode<C> {} + impl<C: ClockSource> Sealed for super::RelativeHardMode<C> {} + impl<C: ClockSource> Sealed for super::AbsolutePinnedHardMode<C> {} + impl<C: ClockSource> Sealed for super::RelativePinnedHardMode<C> {} +} + +/// Operational mode of [`HrTimer`]. +pub trait HrTimerMode: private::Sealed { + /// The C representation of hrtimer mode. + const C_MODE: bindings::hrtimer_mode; + + /// Type representing the clock source. + type Clock: ClockSource; + + /// Type representing the expiration specification (absolute or relative time). + type Expires: HrTimerExpires; +} + +/// Timer that expires at a fixed point in time. +pub struct AbsoluteMode<C: ClockSource>(PhantomData<C>); + +impl<C: ClockSource> HrTimerMode for AbsoluteMode<C> { + const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_ABS; + + type Clock = C; + type Expires = Instant<C>; +} + +/// Timer that expires after a delay from now. +pub struct RelativeMode<C: ClockSource>(PhantomData<C>); + +impl<C: ClockSource> HrTimerMode for RelativeMode<C> { + const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_REL; + + type Clock = C; + type Expires = Delta; +} + +/// Timer with absolute expiration time, pinned to its current CPU. 
+pub struct AbsolutePinnedMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for AbsolutePinnedMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_ABS_PINNED;
+
+    type Clock = C;
+    type Expires = Instant<C>;
+}
+
+/// Timer with relative expiration time, pinned to its current CPU.
+pub struct RelativePinnedMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for RelativePinnedMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_REL_PINNED;
+
+    type Clock = C;
+    type Expires = Delta;
+}
+
+/// Timer with absolute expiration, handled in soft irq context.
+pub struct AbsoluteSoftMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for AbsoluteSoftMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_ABS_SOFT;
+
+    type Clock = C;
+    type Expires = Instant<C>;
+}
+
+/// Timer with relative expiration, handled in soft irq context.
+pub struct RelativeSoftMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for RelativeSoftMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_REL_SOFT;
+
+    type Clock = C;
+    type Expires = Delta;
+}
+
+/// Timer with absolute expiration, pinned to CPU and handled in soft irq context.
+pub struct AbsolutePinnedSoftMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for AbsolutePinnedSoftMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_ABS_PINNED_SOFT;
+
+    type Clock = C;
+    type Expires = Instant<C>;
+}
+
+/// Timer with relative expiration, pinned to CPU and handled in soft irq context.
+pub struct RelativePinnedSoftMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for RelativePinnedSoftMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_REL_PINNED_SOFT;
+
+    type Clock = C;
+    type Expires = Delta;
+}
+
+/// Timer with absolute expiration, handled in hard irq context.
+pub struct AbsoluteHardMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for AbsoluteHardMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_ABS_HARD;
+
+    type Clock = C;
+    type Expires = Instant<C>;
+}
+
+/// Timer with relative expiration, handled in hard irq context.
+pub struct RelativeHardMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for RelativeHardMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_REL_HARD;
+
+    type Clock = C;
+    type Expires = Delta;
+}
+
+/// Timer with absolute expiration, pinned to CPU and handled in hard irq context.
+pub struct AbsolutePinnedHardMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for AbsolutePinnedHardMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_ABS_PINNED_HARD;
+
+    type Clock = C;
+    type Expires = Instant<C>;
+}
+
+/// Timer with relative expiration, pinned to CPU and handled in hard irq context.
+pub struct RelativePinnedHardMode<C: ClockSource>(PhantomData<C>);
+impl<C: ClockSource> HrTimerMode for RelativePinnedHardMode<C> {
+    const C_MODE: bindings::hrtimer_mode = bindings::hrtimer_mode_HRTIMER_MODE_REL_PINNED_HARD;
+
+    type Clock = C;
+    type Expires = Delta;
+}
+
 /// Use to implement the [`HasHrTimer<T>`] trait.
 ///
 /// See [`module`] documentation for an example.
@@ -496,12 +596,16 @@ macro_rules! impl_has_hr_timer {
     impl$({$($generics:tt)*})?
HasHrTimer<$timer_type:ty> for $self:ty - { self.$field:ident } + { + mode : $mode:ty, + field : self.$field:ident $(,)? + } $($rest:tt)* ) => { // SAFETY: This implementation of `raw_get_timer` only compiles if the // field has the right type. unsafe impl$(<$($generics)*>)? $crate::time::hrtimer::HasHrTimer<$timer_type> for $self { + type TimerMode = $mode; #[inline] unsafe fn raw_get_timer( diff --git a/rust/kernel/time/hrtimer/arc.rs b/rust/kernel/time/hrtimer/arc.rs index ccf1e66e5b2d..ed490a7a8950 100644 --- a/rust/kernel/time/hrtimer/arc.rs +++ b/rust/kernel/time/hrtimer/arc.rs @@ -4,8 +4,8 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; use super::HrTimerHandle; +use super::HrTimerMode; use super::HrTimerPointer; -use super::Ktime; use super::RawHrTimerCallback; use crate::sync::Arc; use crate::sync::ArcBorrow; @@ -54,9 +54,13 @@ where T: HasHrTimer<T>, T: for<'a> HrTimerCallback<Pointer<'a> = Self>, { + type TimerMode = <T as HasHrTimer<T>>::TimerMode; type TimerHandle = ArcHrTimerHandle<T>; - fn start(self, expires: Ktime) -> ArcHrTimerHandle<T> { + fn start( + self, + expires: <<T as HasHrTimer<T>>::TimerMode as HrTimerMode>::Expires, + ) -> ArcHrTimerHandle<T> { // SAFETY: // - We keep `self` alive by wrapping it in a handle below. // - Since we generate the pointer passed to `start` from a valid diff --git a/rust/kernel/time/hrtimer/pin.rs b/rust/kernel/time/hrtimer/pin.rs index 293ca9cf058c..aef16d9ee2f0 100644 --- a/rust/kernel/time/hrtimer/pin.rs +++ b/rust/kernel/time/hrtimer/pin.rs @@ -4,7 +4,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; use super::HrTimerHandle; -use super::Ktime; +use super::HrTimerMode; use super::RawHrTimerCallback; use super::UnsafeHrTimerPointer; use core::pin::Pin; @@ -54,9 +54,13 @@ where T: HasHrTimer<T>, T: HrTimerCallback<Pointer<'a> = Self>, { + type TimerMode = <T as HasHrTimer<T>>::TimerMode; type TimerHandle = PinHrTimerHandle<'a, T>; - unsafe fn start(self, expires: Ktime) -> Self::TimerHandle { + unsafe fn start( + self, + expires: <<T as HasHrTimer<T>>::TimerMode as HrTimerMode>::Expires, + ) -> Self::TimerHandle { // Cast to pointer let self_ptr: *const T = self.get_ref(); @@ -79,7 +83,7 @@ where unsafe extern "C" fn run(ptr: *mut bindings::hrtimer) -> bindings::hrtimer_restart { // `HrTimer` is `repr(C)` - let timer_ptr = ptr as *mut HrTimer<T>; + let timer_ptr = ptr.cast::<HrTimer<T>>(); // SAFETY: By the safety requirement of this function, `timer_ptr` // points to a `HrTimer<T>` contained in an `T`. diff --git a/rust/kernel/time/hrtimer/pin_mut.rs b/rust/kernel/time/hrtimer/pin_mut.rs index 6033572d35ad..767d0a4e8a2c 100644 --- a/rust/kernel/time/hrtimer/pin_mut.rs +++ b/rust/kernel/time/hrtimer/pin_mut.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 use super::{ - HasHrTimer, HrTimer, HrTimerCallback, HrTimerHandle, Ktime, RawHrTimerCallback, + HasHrTimer, HrTimer, HrTimerCallback, HrTimerHandle, HrTimerMode, RawHrTimerCallback, UnsafeHrTimerPointer, }; use core::{marker::PhantomData, pin::Pin, ptr::NonNull}; @@ -52,9 +52,13 @@ where T: HasHrTimer<T>, T: HrTimerCallback<Pointer<'a> = Self>, { + type TimerMode = <T as HasHrTimer<T>>::TimerMode; type TimerHandle = PinMutHrTimerHandle<'a, T>; - unsafe fn start(mut self, expires: Ktime) -> Self::TimerHandle { + unsafe fn start( + mut self, + expires: <<T as HasHrTimer<T>>::TimerMode as HrTimerMode>::Expires, + ) -> Self::TimerHandle { // SAFETY: // - We promise not to move out of `self`. 
We only pass `self` // back to the caller as a `Pin<&mut self>`. @@ -83,7 +87,7 @@ where unsafe extern "C" fn run(ptr: *mut bindings::hrtimer) -> bindings::hrtimer_restart { // `HrTimer` is `repr(C)` - let timer_ptr = ptr as *mut HrTimer<T>; + let timer_ptr = ptr.cast::<HrTimer<T>>(); // SAFETY: By the safety requirement of this function, `timer_ptr` // points to a `HrTimer<T>` contained in an `T`. diff --git a/rust/kernel/time/hrtimer/tbox.rs b/rust/kernel/time/hrtimer/tbox.rs index 29526a5da203..ec08303315f2 100644 --- a/rust/kernel/time/hrtimer/tbox.rs +++ b/rust/kernel/time/hrtimer/tbox.rs @@ -4,8 +4,8 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; use super::HrTimerHandle; +use super::HrTimerMode; use super::HrTimerPointer; -use super::Ktime; use super::RawHrTimerCallback; use crate::prelude::*; use core::ptr::NonNull; @@ -64,9 +64,13 @@ where T: for<'a> HrTimerCallback<Pointer<'a> = Pin<Box<T, A>>>, A: crate::alloc::Allocator, { + type TimerMode = <T as HasHrTimer<T>>::TimerMode; type TimerHandle = BoxHrTimerHandle<T, A>; - fn start(self, expires: Ktime) -> Self::TimerHandle { + fn start( + self, + expires: <<T as HasHrTimer<T>>::TimerMode as HrTimerMode>::Expires, + ) -> Self::TimerHandle { // SAFETY: // - We will not move out of this box during timer callback (we pass an // immutable reference to the callback). diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index 3958a5f44d56..dc0a02f5c3cf 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -2,15 +2,17 @@ //! Kernel types. +use crate::ffi::c_void; use core::{ cell::UnsafeCell, marker::{PhantomData, PhantomPinned}, - mem::{ManuallyDrop, MaybeUninit}, + mem::MaybeUninit, ops::{Deref, DerefMut}, - ptr::NonNull, }; use pin_init::{PinInit, Wrapper, Zeroable}; +pub use crate::sync::aref::{ARef, AlwaysRefCounted}; + /// Used to transfer ownership to and from foreign (non-Rust) languages. /// /// Ownership is transferred from Rust to a foreign language by calling [`Self::into_foreign`] and @@ -21,15 +23,10 @@ use pin_init::{PinInit, Wrapper, Zeroable}; /// /// # Safety /// -/// Implementers must ensure that [`into_foreign`] returns a pointer which meets the alignment -/// requirements of [`PointedTo`]. -/// -/// [`into_foreign`]: Self::into_foreign -/// [`PointedTo`]: Self::PointedTo +/// - Implementations must satisfy the guarantees of [`Self::into_foreign`]. pub unsafe trait ForeignOwnable: Sized { - /// Type used when the value is foreign-owned. In practical terms only defines the alignment of - /// the pointer. - type PointedTo; + /// The alignment of pointers returned by `into_foreign`. + const FOREIGN_ALIGN: usize; /// Type used to immutably borrow a value that is currently foreign-owned. type Borrowed<'a>; @@ -39,18 +36,21 @@ pub unsafe trait ForeignOwnable: Sized { /// Converts a Rust-owned object to a foreign-owned one. /// + /// The foreign representation is a pointer to void. Aside from the guarantees listed below, + /// there are no other guarantees for this pointer. For example, it might be invalid, dangling + /// or pointing to uninitialized memory. Using it in any way except for [`from_foreign`], + /// [`try_from_foreign`], [`borrow`], or [`borrow_mut`] can result in undefined behavior. + /// /// # Guarantees /// - /// The return value is guaranteed to be well-aligned, but there are no other guarantees for - /// this pointer. For example, it might be null, dangling, or point to uninitialized memory. 
- /// Using it in any way except for [`ForeignOwnable::from_foreign`], [`ForeignOwnable::borrow`], - /// [`ForeignOwnable::try_from_foreign`] can result in undefined behavior. + /// - Minimum alignment of returned pointer is [`Self::FOREIGN_ALIGN`]. + /// - The returned pointer is not null. /// /// [`from_foreign`]: Self::from_foreign /// [`try_from_foreign`]: Self::try_from_foreign /// [`borrow`]: Self::borrow /// [`borrow_mut`]: Self::borrow_mut - fn into_foreign(self) -> *mut Self::PointedTo; + fn into_foreign(self) -> *mut c_void; /// Converts a foreign-owned object back to a Rust-owned one. /// @@ -60,7 +60,7 @@ pub unsafe trait ForeignOwnable: Sized { /// must not be passed to `from_foreign` more than once. /// /// [`into_foreign`]: Self::into_foreign - unsafe fn from_foreign(ptr: *mut Self::PointedTo) -> Self; + unsafe fn from_foreign(ptr: *mut c_void) -> Self; /// Tries to convert a foreign-owned object back to a Rust-owned one. /// @@ -72,7 +72,7 @@ pub unsafe trait ForeignOwnable: Sized { /// `ptr` must either be null or satisfy the safety requirements for [`from_foreign`]. /// /// [`from_foreign`]: Self::from_foreign - unsafe fn try_from_foreign(ptr: *mut Self::PointedTo) -> Option<Self> { + unsafe fn try_from_foreign(ptr: *mut c_void) -> Option<Self> { if ptr.is_null() { None } else { @@ -95,7 +95,7 @@ pub unsafe trait ForeignOwnable: Sized { /// /// [`into_foreign`]: Self::into_foreign /// [`from_foreign`]: Self::from_foreign - unsafe fn borrow<'a>(ptr: *mut Self::PointedTo) -> Self::Borrowed<'a>; + unsafe fn borrow<'a>(ptr: *mut c_void) -> Self::Borrowed<'a>; /// Borrows a foreign-owned object mutably. /// @@ -123,23 +123,24 @@ pub unsafe trait ForeignOwnable: Sized { /// [`from_foreign`]: Self::from_foreign /// [`borrow`]: Self::borrow /// [`Arc`]: crate::sync::Arc - unsafe fn borrow_mut<'a>(ptr: *mut Self::PointedTo) -> Self::BorrowedMut<'a>; + unsafe fn borrow_mut<'a>(ptr: *mut c_void) -> Self::BorrowedMut<'a>; } -// SAFETY: The `into_foreign` function returns a pointer that is dangling, but well-aligned. +// SAFETY: The pointer returned by `into_foreign` comes from a well aligned +// pointer to `()`. unsafe impl ForeignOwnable for () { - type PointedTo = (); + const FOREIGN_ALIGN: usize = core::mem::align_of::<()>(); type Borrowed<'a> = (); type BorrowedMut<'a> = (); - fn into_foreign(self) -> *mut Self::PointedTo { + fn into_foreign(self) -> *mut c_void { core::ptr::NonNull::dangling().as_ptr() } - unsafe fn from_foreign(_: *mut Self::PointedTo) -> Self {} + unsafe fn from_foreign(_: *mut c_void) -> Self {} - unsafe fn borrow<'a>(_: *mut Self::PointedTo) -> Self::Borrowed<'a> {} - unsafe fn borrow_mut<'a>(_: *mut Self::PointedTo) -> Self::BorrowedMut<'a> {} + unsafe fn borrow<'a>(_: *mut c_void) -> Self::Borrowed<'a> {} + unsafe fn borrow_mut<'a>(_: *mut c_void) -> Self::BorrowedMut<'a> {} } /// Runs a cleanup function/closure when dropped. @@ -366,7 +367,7 @@ impl<T> Opaque<T> { // initialize the `T`. unsafe { pin_init::pin_init_from_closure::<_, ::core::convert::Infallible>(move |slot| { - init_func(Self::raw_get(slot)); + init_func(Self::cast_into(slot)); Ok(()) }) } @@ -386,7 +387,7 @@ impl<T> Opaque<T> { // SAFETY: We contain a `MaybeUninit`, so it is OK for the `init_func` to not fully // initialize the `T`. 
unsafe { - pin_init::pin_init_from_closure::<_, E>(move |slot| init_func(Self::raw_get(slot))) + pin_init::pin_init_from_closure::<_, E>(move |slot| init_func(Self::cast_into(slot))) } } @@ -399,9 +400,14 @@ impl<T> Opaque<T> { /// /// This function is useful to get access to the value without creating intermediate /// references. - pub const fn raw_get(this: *const Self) -> *mut T { + pub const fn cast_into(this: *const Self) -> *mut T { UnsafeCell::raw_get(this.cast::<UnsafeCell<MaybeUninit<T>>>()).cast::<T>() } + + /// The opposite operation of [`Opaque::cast_into`]. + pub const fn cast_from(this: *const T) -> *const Self { + this.cast() + } } impl<T> Wrapper<T> for Opaque<T> { @@ -417,173 +423,6 @@ impl<T> Wrapper<T> for Opaque<T> { } } -/// Types that are _always_ reference counted. -/// -/// It allows such types to define their own custom ref increment and decrement functions. -/// Additionally, it allows users to convert from a shared reference `&T` to an owned reference -/// [`ARef<T>`]. -/// -/// This is usually implemented by wrappers to existing structures on the C side of the code. For -/// Rust code, the recommendation is to use [`Arc`](crate::sync::Arc) to create reference-counted -/// instances of a type. -/// -/// # Safety -/// -/// Implementers must ensure that increments to the reference count keep the object alive in memory -/// at least until matching decrements are performed. -/// -/// Implementers must also ensure that all instances are reference-counted. (Otherwise they -/// won't be able to honour the requirement that [`AlwaysRefCounted::inc_ref`] keep the object -/// alive.) -pub unsafe trait AlwaysRefCounted { - /// Increments the reference count on the object. - fn inc_ref(&self); - - /// Decrements the reference count on the object. - /// - /// Frees the object when the count reaches zero. - /// - /// # Safety - /// - /// Callers must ensure that there was a previous matching increment to the reference count, - /// and that the object is no longer used after its reference count is decremented (as it may - /// result in the object being freed), unless the caller owns another increment on the refcount - /// (e.g., it calls [`AlwaysRefCounted::inc_ref`] twice, then calls - /// [`AlwaysRefCounted::dec_ref`] once). - unsafe fn dec_ref(obj: NonNull<Self>); -} - -/// An owned reference to an always-reference-counted object. -/// -/// The object's reference count is automatically decremented when an instance of [`ARef`] is -/// dropped. It is also automatically incremented when a new instance is created via -/// [`ARef::clone`]. -/// -/// # Invariants -/// -/// The pointer stored in `ptr` is non-null and valid for the lifetime of the [`ARef`] instance. In -/// particular, the [`ARef`] instance owns an increment on the underlying object's reference count. -pub struct ARef<T: AlwaysRefCounted> { - ptr: NonNull<T>, - _p: PhantomData<T>, -} - -// SAFETY: It is safe to send `ARef<T>` to another thread when the underlying `T` is `Sync` because -// it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs -// `T` to be `Send` because any thread that has an `ARef<T>` may ultimately access `T` using a -// mutable reference, for example, when the reference count reaches zero and `T` is dropped. 
-unsafe impl<T: AlwaysRefCounted + Sync + Send> Send for ARef<T> {} - -// SAFETY: It is safe to send `&ARef<T>` to another thread when the underlying `T` is `Sync` -// because it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, -// it needs `T` to be `Send` because any thread that has a `&ARef<T>` may clone it and get an -// `ARef<T>` on that thread, so the thread may ultimately access `T` using a mutable reference, for -// example, when the reference count reaches zero and `T` is dropped. -unsafe impl<T: AlwaysRefCounted + Sync + Send> Sync for ARef<T> {} - -impl<T: AlwaysRefCounted> ARef<T> { - /// Creates a new instance of [`ARef`]. - /// - /// It takes over an increment of the reference count on the underlying object. - /// - /// # Safety - /// - /// Callers must ensure that the reference count was incremented at least once, and that they - /// are properly relinquishing one increment. That is, if there is only one increment, callers - /// must not use the underlying object anymore -- it is only safe to do so via the newly - /// created [`ARef`]. - pub unsafe fn from_raw(ptr: NonNull<T>) -> Self { - // INVARIANT: The safety requirements guarantee that the new instance now owns the - // increment on the refcount. - Self { - ptr, - _p: PhantomData, - } - } - - /// Consumes the `ARef`, returning a raw pointer. - /// - /// This function does not change the refcount. After calling this function, the caller is - /// responsible for the refcount previously managed by the `ARef`. - /// - /// # Examples - /// - /// ``` - /// use core::ptr::NonNull; - /// use kernel::types::{ARef, AlwaysRefCounted}; - /// - /// struct Empty {} - /// - /// # // SAFETY: TODO. - /// unsafe impl AlwaysRefCounted for Empty { - /// fn inc_ref(&self) {} - /// unsafe fn dec_ref(_obj: NonNull<Self>) {} - /// } - /// - /// let mut data = Empty {}; - /// let ptr = NonNull::<Empty>::new(&mut data).unwrap(); - /// # // SAFETY: TODO. - /// let data_ref: ARef<Empty> = unsafe { ARef::from_raw(ptr) }; - /// let raw_ptr: NonNull<Empty> = ARef::into_raw(data_ref); - /// - /// assert_eq!(ptr, raw_ptr); - /// ``` - pub fn into_raw(me: Self) -> NonNull<T> { - ManuallyDrop::new(me).ptr - } -} - -impl<T: AlwaysRefCounted> Clone for ARef<T> { - fn clone(&self) -> Self { - self.inc_ref(); - // SAFETY: We just incremented the refcount above. - unsafe { Self::from_raw(self.ptr) } - } -} - -impl<T: AlwaysRefCounted> Deref for ARef<T> { - type Target = T; - - fn deref(&self) -> &Self::Target { - // SAFETY: The type invariants guarantee that the object is valid. - unsafe { self.ptr.as_ref() } - } -} - -impl<T: AlwaysRefCounted> From<&T> for ARef<T> { - fn from(b: &T) -> Self { - b.inc_ref(); - // SAFETY: We just incremented the refcount above. - unsafe { Self::from_raw(NonNull::from(b)) } - } -} - -impl<T: AlwaysRefCounted> Drop for ARef<T> { - fn drop(&mut self) { - // SAFETY: The type invariants guarantee that the `ARef` owns the reference we're about to - // decrement. - unsafe { T::dec_ref(self.ptr) }; - } -} - -/// A sum type that always holds either a value of type `L` or `R`. -/// -/// # Examples -/// -/// ``` -/// use kernel::types::Either; -/// -/// let left_value: Either<i32, &str> = Either::Left(7); -/// let right_value: Either<i32, &str> = Either::Right("right value"); -/// ``` -pub enum Either<L, R> { - /// Constructs an instance of [`Either`] containing a value of type `L`. - Left(L), - - /// Constructs an instance of [`Either`] containing a value of type `R`. 
- Right(R), -} - /// Zero-sized type to mark types not [`Send`]. /// /// Add this type as a field to your struct if your type should not be sent to a different task. diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs index 6d70edd8086a..a8fb4764185a 100644 --- a/rust/kernel/uaccess.rs +++ b/rust/kernel/uaccess.rs @@ -8,14 +8,57 @@ use crate::{ alloc::{Allocator, Flags}, bindings, error::Result, - ffi::c_void, + ffi::{c_char, c_void}, prelude::*, transmute::{AsBytes, FromBytes}, }; use core::mem::{size_of, MaybeUninit}; -/// The type used for userspace addresses. -pub type UserPtr = usize; +/// A pointer into userspace. +/// +/// This is the Rust equivalent to C pointers tagged with `__user`. +#[repr(transparent)] +#[derive(Copy, Clone)] +pub struct UserPtr(*mut c_void); + +impl UserPtr { + /// Create a `UserPtr` from an integer representing the userspace address. + #[inline] + pub fn from_addr(addr: usize) -> Self { + Self(addr as *mut c_void) + } + + /// Create a `UserPtr` from a pointer representing the userspace address. + #[inline] + pub fn from_ptr(addr: *mut c_void) -> Self { + Self(addr) + } + + /// Cast this userspace pointer to a raw const void pointer. + /// + /// It is up to the caller to use the returned pointer correctly. + #[inline] + pub fn as_const_ptr(self) -> *const c_void { + self.0 + } + + /// Cast this userspace pointer to a raw mutable void pointer. + /// + /// It is up to the caller to use the returned pointer correctly. + #[inline] + pub fn as_mut_ptr(self) -> *mut c_void { + self.0 + } + + /// Increment this user pointer by `add` bytes. + /// + /// This addition is wrapping, so wrapping around the address space does not result in a panic + /// even if `CONFIG_RUST_OVERFLOW_CHECKS` is enabled. + #[inline] + pub fn wrapping_byte_add(self, add: usize) -> UserPtr { + UserPtr(self.0.wrapping_byte_add(add)) + } +} /// A pointer to an area in userspace memory, which can be either read-only or read-write. /// @@ -177,7 +220,7 @@ impl UserSliceReader { pub fn skip(&mut self, num_skip: usize) -> Result { // Update `self.length` first since that's the fallible part of this operation. self.length = self.length.checked_sub(num_skip).ok_or(EFAULT)?; - self.ptr = self.ptr.wrapping_add(num_skip); + self.ptr = self.ptr.wrapping_byte_add(num_skip); Ok(()) } @@ -224,11 +267,11 @@ impl UserSliceReader { } // SAFETY: `out_ptr` points into a mutable slice of length `len`, so we may write // that many bytes to it. - let res = unsafe { bindings::copy_from_user(out_ptr, self.ptr as *const c_void, len) }; + let res = unsafe { bindings::copy_from_user(out_ptr, self.ptr.as_const_ptr(), len) }; if res != 0 { return Err(EFAULT); } - self.ptr = self.ptr.wrapping_add(len); + self.ptr = self.ptr.wrapping_byte_add(len); self.length -= len; Ok(()) } @@ -240,7 +283,7 @@ impl UserSliceReader { pub fn read_slice(&mut self, out: &mut [u8]) -> Result { // SAFETY: The types are compatible and `read_raw` doesn't write uninitialized bytes to // `out`. 
- let out = unsafe { &mut *(out as *mut [u8] as *mut [MaybeUninit<u8>]) }; + let out = unsafe { &mut *(core::ptr::from_mut(out) as *mut [MaybeUninit<u8>]) }; self.read_raw(out) } @@ -262,14 +305,14 @@ impl UserSliceReader { let res = unsafe { bindings::_copy_from_user( out.as_mut_ptr().cast::<c_void>(), - self.ptr as *const c_void, + self.ptr.as_const_ptr(), len, ) }; if res != 0 { return Err(EFAULT); } - self.ptr = self.ptr.wrapping_add(len); + self.ptr = self.ptr.wrapping_byte_add(len); self.length -= len; // SAFETY: The read above has initialized all bytes in `out`, and since `T` implements // `FromBytes`, any bit-pattern is a valid value for this type. @@ -291,6 +334,65 @@ impl UserSliceReader { unsafe { buf.inc_len(len) }; Ok(()) } + + /// Read a NUL-terminated string from userspace and return it. + /// + /// The string is read into `buf` and a NUL-terminator is added if the end of `buf` is reached. + /// Since there must be space to add a NUL-terminator, the buffer must not be empty. The + /// returned `&CStr` points into `buf`. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address (some data may have been + /// copied). + #[doc(alias = "strncpy_from_user")] + pub fn strcpy_into_buf<'buf>(self, buf: &'buf mut [u8]) -> Result<&'buf CStr> { + if buf.is_empty() { + return Err(EINVAL); + } + + // SAFETY: The types are compatible and `strncpy_from_user` doesn't write uninitialized + // bytes to `buf`. + let mut dst = unsafe { &mut *(core::ptr::from_mut(buf) as *mut [MaybeUninit<u8>]) }; + + // We never read more than `self.length` bytes. + if dst.len() > self.length { + dst = &mut dst[..self.length]; + } + + let mut len = raw_strncpy_from_user(dst, self.ptr)?; + if len < dst.len() { + // Add one to include the NUL-terminator. + len += 1; + } else if len < buf.len() { + // This implies that `len == dst.len() < buf.len()`. + // + // This means that we could not fill the entire buffer, but we had to stop reading + // because we hit the `self.length` limit of this `UserSliceReader`. Since we did not + // fill the buffer, we treat this case as if we tried to read past the `self.length` + // limit and received a page fault, which is consistent with other `UserSliceReader` + // methods that also return page faults when you exceed `self.length`. + return Err(EFAULT); + } else { + // This implies that `len == buf.len()`. + // + // This means that we filled the buffer exactly. In this case, we add a NUL-terminator + // and return it. Unlike the `len < dst.len()` branch, don't modify `len` because it + // already represents the length including the NUL-terminator. + // + // SAFETY: Due to the check at the beginning, the buffer is not empty. + unsafe { *buf.last_mut().unwrap_unchecked() = 0 }; + } + + // This method consumes `self`, so it can only be called once, thus we do not need to + // update `self.length`. This sidesteps concerns such as whether `self.length` should be + // incremented by `len` or `len-1` in the `len == buf.len()` case. + + // SAFETY: There are two cases: + // * If we hit the `len < dst.len()` case, then `raw_strncpy_from_user` guarantees that + // this slice contains exactly one NUL byte at the end of the string. + // * Otherwise, `raw_strncpy_from_user` guarantees that the string contained no NUL bytes, + // and we have since added a NUL byte at the end. + Ok(unsafe { CStr::from_bytes_with_nul_unchecked(&buf[..len]) }) + } } /// A writer for [`UserSlice`]. 
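The `strcpy_into_buf` helper added in the hunk above is easiest to follow with a short usage
sketch. The sketch below is illustrative only: the `get_label` function name and the 64-byte
buffer size are invented, while `UserSlice::new`, `reader()` and `strcpy_into_buf` are the
uaccess APIs shown in this file.

    // Illustrative sketch: read a NUL-terminated string from userspace.
    fn get_label(from: UserPtr, len: usize) -> Result<()> {
        // Hypothetical fixed-size destination buffer.
        let mut buf = [0u8; 64];
        // `strcpy_into_buf` consumes the reader, copies at most `len` bytes,
        // and returns a `&CStr` borrowed from `buf`; every success path
        // leaves the string NUL-terminated.
        let label = UserSlice::new(from, len).reader().strcpy_into_buf(&mut buf)?;
        pr_info!("label: {:?}\n", label);
        Ok(())
    }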
@@ -327,11 +429,11 @@ impl UserSliceWriter { } // SAFETY: `data_ptr` points into an immutable slice of length `len`, so we may read // that many bytes from it. - let res = unsafe { bindings::copy_to_user(self.ptr as *mut c_void, data_ptr, len) }; + let res = unsafe { bindings::copy_to_user(self.ptr.as_mut_ptr(), data_ptr, len) }; if res != 0 { return Err(EFAULT); } - self.ptr = self.ptr.wrapping_add(len); + self.ptr = self.ptr.wrapping_byte_add(len); self.length -= len; Ok(()) } @@ -354,16 +456,53 @@ impl UserSliceWriter { // is a compile-time constant. let res = unsafe { bindings::_copy_to_user( - self.ptr as *mut c_void, - (value as *const T).cast::<c_void>(), + self.ptr.as_mut_ptr(), + core::ptr::from_ref(value).cast::<c_void>(), len, ) }; if res != 0 { return Err(EFAULT); } - self.ptr = self.ptr.wrapping_add(len); + self.ptr = self.ptr.wrapping_byte_add(len); self.length -= len; Ok(()) } } + +/// Reads a nul-terminated string into `dst` and returns the length. +/// +/// This reads from userspace until a NUL byte is encountered, or until `dst.len()` bytes have been +/// read. Fails with [`EFAULT`] if a read happens on a bad address (some data may have been +/// copied). When the end of the buffer is encountered, no NUL byte is added, so the string is +/// *not* guaranteed to be NUL-terminated when `Ok(dst.len())` is returned. +/// +/// # Guarantees +/// +/// When this function returns `Ok(len)`, it is guaranteed that the first `len` bytes of `dst` are +/// initialized and non-zero. Furthermore, if `len < dst.len()`, then `dst[len]` is a NUL byte. +#[inline] +fn raw_strncpy_from_user(dst: &mut [MaybeUninit<u8>], src: UserPtr) -> Result<usize> { + // CAST: Slice lengths are guaranteed to be `<= isize::MAX`. + let len = dst.len() as isize; + + // SAFETY: `dst` is valid for writing `dst.len()` bytes. + let res = unsafe { + bindings::strncpy_from_user( + dst.as_mut_ptr().cast::<c_char>(), + src.as_const_ptr().cast::<c_char>(), + len, + ) + }; + + if res < 0 { + return Err(Error::from_errno(res as i32)); + } + + #[cfg(CONFIG_RUST_OVERFLOW_CHECKS)] + assert!(res <= len); + + // GUARANTEES: `strncpy_from_user` was successful, so `dst` has contents in accordance with the + // guarantees of this function. + Ok(res as usize) +} diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index d092112d843f..b9343d5bc00f 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -26,7 +26,7 @@ //! * The [`WorkItemPointer`] trait is implemented for the pointer type that points at a something //! that implements [`WorkItem`]. //! -//! ## Example +//! ## Examples //! //! This example defines a struct that holds an integer and can be scheduled on the workqueue. When //! the struct is executed, it will print the integer. Since there is only one `work_struct` field, @@ -131,10 +131,69 @@ //! # print_2_later(MyStruct::new(41, 42).unwrap()); //! ``` //! +//! This example shows how you can schedule delayed work items: +//! +//! ``` +//! use kernel::sync::Arc; +//! use kernel::workqueue::{self, impl_has_delayed_work, new_delayed_work, DelayedWork, WorkItem}; +//! +//! #[pin_data] +//! struct MyStruct { +//! value: i32, +//! #[pin] +//! work: DelayedWork<MyStruct>, +//! } +//! +//! impl_has_delayed_work! { +//! impl HasDelayedWork<Self> for MyStruct { self.work } +//! } +//! +//! impl MyStruct { +//! fn new(value: i32) -> Result<Arc<Self>> { +//! Arc::pin_init( +//! pin_init!(MyStruct { +//! value, +//! work <- new_delayed_work!("MyStruct::work"), +//! }), +//! GFP_KERNEL, +//! ) +//! } +//! 
}
+//!
+//! impl WorkItem for MyStruct {
+//!     type Pointer = Arc<MyStruct>;
+//!
+//!     fn run(this: Arc<MyStruct>) {
+//!         pr_info!("The value is: {}\n", this.value);
+//!     }
+//! }
+//!
+//! /// This method will enqueue the struct for execution on the system workqueue, where its value
+//! /// will be printed 12 jiffies later.
+//! fn print_later(val: Arc<MyStruct>) {
+//!     let _ = workqueue::system().enqueue_delayed(val, 12);
+//! }
+//!
+//! /// It is also possible to use the ordinary `enqueue` method together with `DelayedWork`. This
+//! /// is equivalent to calling `enqueue_delayed` with a delay of zero.
+//! fn print_now(val: Arc<MyStruct>) {
+//!     let _ = workqueue::system().enqueue(val);
+//! }
+//! # print_later(MyStruct::new(42).unwrap());
+//! # print_now(MyStruct::new(42).unwrap());
+//! ```
+//!
 //! C header: [`include/linux/workqueue.h`](srctree/include/linux/workqueue.h)
 
-use crate::alloc::{AllocError, Flags};
-use crate::{prelude::*, sync::Arc, sync::LockClassKey, types::Opaque};
+use crate::{
+    alloc::{AllocError, Flags},
+    container_of,
+    prelude::*,
+    sync::Arc,
+    sync::LockClassKey,
+    time::Jiffies,
+    types::Opaque,
+};
 use core::marker::PhantomData;
 
 /// Creates a [`Work`] initialiser with the given name and a newly-created lock class.
@@ -146,6 +205,33 @@ macro_rules! new_work {
 }
 pub use new_work;
 
+/// Creates a [`DelayedWork`] initialiser with the given name and a newly-created lock class.
+#[macro_export]
+macro_rules! new_delayed_work {
+    () => {
+        $crate::workqueue::DelayedWork::new(
+            $crate::optional_name!(),
+            $crate::static_lock_class!(),
+            $crate::c_str!(::core::concat!(
+                ::core::file!(),
+                ":",
+                ::core::line!(),
+                "_timer"
+            )),
+            $crate::static_lock_class!(),
+        )
+    };
+    ($name:literal) => {
+        $crate::workqueue::DelayedWork::new(
+            $crate::c_str!($name),
+            $crate::static_lock_class!(),
+            $crate::c_str!(::core::concat!($name, "_timer")),
+            $crate::static_lock_class!(),
+        )
+    };
+}
+pub use new_delayed_work;
+
 /// A kernel work queue.
 ///
 /// Wraps the kernel's C `struct workqueue_struct`.
@@ -170,7 +256,7 @@ impl Queue {
     pub unsafe fn from_raw<'a>(ptr: *const bindings::workqueue_struct) -> &'a Queue {
         // SAFETY: The `Queue` type is `#[repr(transparent)]`, so the pointer cast is valid. The
         // caller promises that the pointer is not dangling.
-        unsafe { &*(ptr as *const Queue) }
+        unsafe { &*ptr.cast::<Queue>() }
     }
 
     /// Enqueues a work item.
@@ -198,7 +284,7 @@
         unsafe {
             w.__enqueue(move |work_ptr| {
                 bindings::queue_work_on(
-                    bindings::wq_misc_consts_WORK_CPU_UNBOUND as _,
+                    bindings::wq_misc_consts_WORK_CPU_UNBOUND as ffi::c_int,
                    queue_ptr,
                    work_ptr,
                )
@@ -206,6 +292,42 @@
         }
     }
 
+    /// Enqueues a delayed work item.
+    ///
+    /// This may fail if the work item is already enqueued in a workqueue.
+    ///
+    /// The work item will be submitted using `WORK_CPU_UNBOUND`.
+    pub fn enqueue_delayed<W, const ID: u64>(&self, w: W, delay: Jiffies) -> W::EnqueueOutput
+    where
+        W: RawDelayedWorkItem<ID> + Send + 'static,
+    {
+        let queue_ptr = self.0.get();
+
+        // SAFETY: We only return `false` if the `work_struct` is already in a workqueue. The other
+        // `__enqueue` requirements are not relevant since `W` is `Send` and static.
+        //
+        // The call to `bindings::queue_delayed_work_on` will dereference the provided raw pointer,
+        // which is ok because `__enqueue` guarantees that the pointer is valid for the duration of
+        // this closure, and the safety requirements of `RawDelayedWorkItem` expand this
+        // requirement to apply to the entire `delayed_work`.
+ // + // Furthermore, if the C workqueue code accesses the pointer after this call to + // `__enqueue`, then the work item was successfully enqueued, and + // `bindings::queue_delayed_work_on` will have returned true. In this case, `__enqueue` + // promises that the raw pointer will stay valid until we call the function pointer in the + // `work_struct`, so the access is ok. + unsafe { + w.__enqueue(move |work_ptr| { + bindings::queue_delayed_work_on( + bindings::wq_misc_consts_WORK_CPU_UNBOUND as ffi::c_int, + queue_ptr, + container_of!(work_ptr, bindings::delayed_work, work), + delay, + ) + }) + } + } + /// Tries to spawn the given function or closure as a work item. /// /// This method can fail because it allocates memory to store the work item. @@ -298,6 +420,16 @@ pub unsafe trait RawWorkItem<const ID: u64> { F: FnOnce(*mut bindings::work_struct) -> bool; } +/// A raw delayed work item. +/// +/// # Safety +/// +/// If the `__enqueue` method in the `RawWorkItem` implementation calls the closure, then the +/// provided pointer must point at the `work` field of a valid `delayed_work`, and the guarantees +/// that `__enqueue` provides about accessing the `work_struct` must also apply to the rest of the +/// `delayed_work` struct. +pub unsafe trait RawDelayedWorkItem<const ID: u64>: RawWorkItem<ID> {} + /// Defines the method that should be called directly when a work item is executed. /// /// This trait is implemented by `Pin<KBox<T>>` and [`Arc<T>`], and is mainly intended to be @@ -403,11 +535,11 @@ impl<T: ?Sized, const ID: u64> Work<T, ID> { // // A pointer cast would also be ok due to `#[repr(transparent)]`. We use `addr_of!` so that // the compiler does not complain that the `work` field is unused. - unsafe { Opaque::raw_get(core::ptr::addr_of!((*ptr).work)) } + unsafe { Opaque::cast_into(core::ptr::addr_of!((*ptr).work)) } } } -/// Declares that a type has a [`Work<T, ID>`] field. +/// Declares that a type contains a [`Work<T, ID>`]. /// /// The intended way of using this trait is via the [`impl_has_work!`] macro. You can use the macro /// like this: @@ -506,6 +638,178 @@ impl_has_work! { impl{T} HasWork<Self> for ClosureWork<T> { self.work } } +/// Links for a delayed work item. +/// +/// This struct contains a function pointer to the [`run`] function from the [`WorkItemPointer`] +/// trait, and defines the linked list pointers necessary to enqueue a work item in a workqueue in +/// a delayed manner. +/// +/// Wraps the kernel's C `struct delayed_work`. +/// +/// This is a helper type used to associate a `delayed_work` with the [`WorkItem`] that uses it. +/// +/// [`run`]: WorkItemPointer::run +#[pin_data] +#[repr(transparent)] +pub struct DelayedWork<T: ?Sized, const ID: u64 = 0> { + #[pin] + dwork: Opaque<bindings::delayed_work>, + _inner: PhantomData<T>, +} + +// SAFETY: Kernel work items are usable from any thread. +// +// We do not need to constrain `T` since the work item does not actually contain a `T`. +unsafe impl<T: ?Sized, const ID: u64> Send for DelayedWork<T, ID> {} +// SAFETY: Kernel work items are usable from any thread. +// +// We do not need to constrain `T` since the work item does not actually contain a `T`. +unsafe impl<T: ?Sized, const ID: u64> Sync for DelayedWork<T, ID> {} + +impl<T: ?Sized, const ID: u64> DelayedWork<T, ID> { + /// Creates a new instance of [`DelayedWork`]. 
+    #[inline]
+    pub fn new(
+        work_name: &'static CStr,
+        work_key: Pin<&'static LockClassKey>,
+        timer_name: &'static CStr,
+        timer_key: Pin<&'static LockClassKey>,
+    ) -> impl PinInit<Self>
+    where
+        T: WorkItem<ID>,
+    {
+        pin_init!(Self {
+            dwork <- Opaque::ffi_init(|slot: *mut bindings::delayed_work| {
+                // SAFETY: The `WorkItemPointer` implementation promises that `run` can be used as
+                // the work item function.
+                unsafe {
+                    bindings::init_work_with_key(
+                        core::ptr::addr_of_mut!((*slot).work),
+                        Some(T::Pointer::run),
+                        false,
+                        work_name.as_char_ptr(),
+                        work_key.as_ptr(),
+                    )
+                }
+
+                // SAFETY: The `delayed_work_timer_fn` function pointer can be used here because
+                // the timer is embedded in a `struct delayed_work`, and only ever scheduled via
+                // the core workqueue code, and configured to run in irqsafe context.
+                unsafe {
+                    bindings::timer_init_key(
+                        core::ptr::addr_of_mut!((*slot).timer),
+                        Some(bindings::delayed_work_timer_fn),
+                        bindings::TIMER_IRQSAFE,
+                        timer_name.as_char_ptr(),
+                        timer_key.as_ptr(),
+                    )
+                }
+            }),
+            _inner: PhantomData,
+        })
+    }
+
+    /// Get a [`Work`] pointer to the `work` field of the inner `delayed_work`.
+    ///
+    /// # Safety
+    ///
+    /// The provided pointer must not be dangling and must be properly aligned. (But the memory
+    /// need not be initialized.)
+    #[inline]
+    pub unsafe fn raw_as_work(ptr: *const Self) -> *mut Work<T, ID> {
+        // SAFETY: The caller promises that the pointer is aligned and not dangling.
+        let dw: *mut bindings::delayed_work =
+            unsafe { Opaque::cast_into(core::ptr::addr_of!((*ptr).dwork)) };
+        // SAFETY: The caller promises that the pointer is aligned and not dangling.
+        let wrk: *mut bindings::work_struct = unsafe { core::ptr::addr_of_mut!((*dw).work) };
+        // CAST: Work and work_struct have compatible layouts.
+        wrk.cast()
+    }
+}
+
+/// Declares that a type contains a [`DelayedWork<T, ID>`].
+///
+/// # Safety
+///
+/// The `HasWork<T, ID>` implementation must return a `work_struct` that is stored in the `work`
+/// field of a `delayed_work` with the same access rules as the `work_struct`.
+pub unsafe trait HasDelayedWork<T, const ID: u64 = 0>: HasWork<T, ID> {}
+
+/// Used to safely implement the [`HasDelayedWork<T, ID>`] trait.
+///
+/// This macro also implements the [`HasWork`] trait, so you do not need to use [`impl_has_work!`]
+/// when using this macro.
+///
+/// # Examples
+///
+/// ```
+/// use kernel::sync::Arc;
+/// use kernel::workqueue::{self, impl_has_delayed_work, DelayedWork};
+///
+/// struct MyStruct<'a, T, const N: usize> {
+///     work_field: DelayedWork<MyStruct<'a, T, N>, 17>,
+///     f: fn(&'a [T; N]),
+/// }
+///
+/// impl_has_delayed_work! {
+///     impl{'a, T, const N: usize} HasDelayedWork<MyStruct<'a, T, N>, 17>
+///     for MyStruct<'a, T, N> { self.work_field }
+/// }
+/// ```
+#[macro_export]
+macro_rules! impl_has_delayed_work {
+    ($(impl$({$($generics:tt)*})?
+       HasDelayedWork<$work_type:ty $(, $id:tt)?>
+       for $self:ty
+       { self.$field:ident }
+    )*) => {$(
+        // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right
+        // type.
+        unsafe impl$(<$($generics)+>)?
+            $crate::workqueue::HasDelayedWork<$work_type $(, $id)?> for $self {}
+
+        // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right
+        // type.
+        unsafe impl$(<$($generics)+>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self {
+            #[inline]
+            unsafe fn raw_get_work(
+                ptr: *mut Self
+            ) -> *mut $crate::workqueue::Work<$work_type $(, $id)?> {
+                // SAFETY: The caller promises that the pointer is not dangling.
+ let ptr: *mut $crate::workqueue::DelayedWork<$work_type $(, $id)?> = unsafe { + ::core::ptr::addr_of_mut!((*ptr).$field) + }; + + // SAFETY: The caller promises that the pointer is not dangling. + unsafe { $crate::workqueue::DelayedWork::raw_as_work(ptr) } + } + + #[inline] + unsafe fn work_container_of( + ptr: *mut $crate::workqueue::Work<$work_type $(, $id)?>, + ) -> *mut Self { + // SAFETY: The caller promises that the pointer points at a field of the right type + // in the right kind of struct. + let ptr = unsafe { $crate::workqueue::Work::raw_get(ptr) }; + + // SAFETY: The caller promises that the pointer points at a field of the right type + // in the right kind of struct. + let delayed_work = unsafe { + $crate::container_of!(ptr, $crate::bindings::delayed_work, work) + }; + + let delayed_work: *mut $crate::workqueue::DelayedWork<$work_type $(, $id)?> = + delayed_work.cast(); + + // SAFETY: The caller promises that the pointer points at a field of the right type + // in the right kind of struct. + unsafe { $crate::container_of!(delayed_work, Self, $field) } + } + } + )*}; +} +pub use impl_has_delayed_work; + // SAFETY: The `__enqueue` implementation in RawWorkItem uses a `work_struct` initialized with the // `run` method of this trait as the function pointer because: // - `__enqueue` gets the `work_struct` from the `Work` field, using `T::raw_get_work`. @@ -522,7 +826,7 @@ where { unsafe extern "C" fn run(ptr: *mut bindings::work_struct) { // The `__enqueue` method always uses a `work_struct` stored in a `Work<T, ID>`. - let ptr = ptr as *mut Work<T, ID>; + let ptr = ptr.cast::<Work<T, ID>>(); // SAFETY: This computes the pointer that `__enqueue` got from `Arc::into_raw`. let ptr = unsafe { T::work_container_of(ptr) }; // SAFETY: This pointer comes from `Arc::into_raw` and we've been given back ownership. @@ -567,6 +871,16 @@ where } } +// SAFETY: By the safety requirements of `HasDelayedWork`, the `work_struct` returned by methods in +// `HasWork` provides a `work_struct` that is the `work` field of a `delayed_work`, and the rest of +// the `delayed_work` has the same access rules as its `work` field. +unsafe impl<T, const ID: u64> RawDelayedWorkItem<ID> for Arc<T> +where + T: WorkItem<ID, Pointer = Self>, + T: HasDelayedWork<T, ID>, +{ +} + // SAFETY: TODO. unsafe impl<T, const ID: u64> WorkItemPointer<ID> for Pin<KBox<T>> where @@ -575,7 +889,7 @@ where { unsafe extern "C" fn run(ptr: *mut bindings::work_struct) { // The `__enqueue` method always uses a `work_struct` stored in a `Work<T, ID>`. - let ptr = ptr as *mut Work<T, ID>; + let ptr = ptr.cast::<Work<T, ID>>(); // SAFETY: This computes the pointer that `__enqueue` got from `Arc::into_raw`. let ptr = unsafe { T::work_container_of(ptr) }; // SAFETY: This pointer comes from `Arc::into_raw` and we've been given back ownership. @@ -617,6 +931,16 @@ where } } +// SAFETY: By the safety requirements of `HasDelayedWork`, the `work_struct` returned by methods in +// `HasWork` provides a `work_struct` that is the `work` field of a `delayed_work`, and the rest of +// the `delayed_work` has the same access rules as its `work` field. +unsafe impl<T, const ID: u64> RawDelayedWorkItem<ID> for Pin<KBox<T>> +where + T: WorkItem<ID, Pointer = Self>, + T: HasDelayedWork<T, ID>, +{ +} + /// Returns the system work queue (`system_wq`). /// /// It is the one used by `schedule[_delayed]_work[_on]()`. Multi-CPU multi-threaded. 
There are diff --git a/rust/kernel/xarray.rs b/rust/kernel/xarray.rs index 75719e7bb491..a49d6db28845 100644 --- a/rust/kernel/xarray.rs +++ b/rust/kernel/xarray.rs @@ -7,9 +7,10 @@ use crate::{ alloc, bindings, build_assert, error::{Error, Result}, + ffi::c_void, types::{ForeignOwnable, NotThreadSafe, Opaque}, }; -use core::{iter, marker::PhantomData, mem, pin::Pin, ptr::NonNull}; +use core::{iter, marker::PhantomData, pin::Pin, ptr::NonNull}; use pin_init::{pin_data, pin_init, pinned_drop, PinInit}; /// An array which efficiently maps sparse integer indices to owned objects. @@ -101,7 +102,7 @@ impl<T: ForeignOwnable> XArray<T> { }) } - fn iter(&self) -> impl Iterator<Item = NonNull<T::PointedTo>> + '_ { + fn iter(&self) -> impl Iterator<Item = NonNull<c_void>> + '_ { let mut index = 0; // SAFETY: `self.xa` is always valid by the type invariant. @@ -179,7 +180,7 @@ impl<T> From<StoreError<T>> for Error { impl<'a, T: ForeignOwnable> Guard<'a, T> { fn load<F, U>(&self, index: usize, f: F) -> Option<U> where - F: FnOnce(NonNull<T::PointedTo>) -> U, + F: FnOnce(NonNull<c_void>) -> U, { // SAFETY: `self.xa.xa` is always valid by the type invariant. let ptr = unsafe { bindings::xa_load(self.xa.xa.get(), index) }; @@ -230,7 +231,7 @@ impl<'a, T: ForeignOwnable> Guard<'a, T> { gfp: alloc::Flags, ) -> Result<Option<T>, StoreError<T>> { build_assert!( - mem::align_of::<T::PointedTo>() >= 4, + T::FOREIGN_ALIGN >= 4, "pointers stored in XArray must be 4-byte aligned" ); let new = value.into_foreign(); diff --git a/rust/macros/module.rs b/rust/macros/module.rs index 75efc6eeeafc..5ee54a00c0b6 100644 --- a/rust/macros/module.rs +++ b/rust/macros/module.rs @@ -94,7 +94,6 @@ struct ModuleInfo { type_: String, license: String, name: String, - author: Option<String>, authors: Option<Vec<String>>, description: Option<String>, alias: Option<Vec<String>>, @@ -108,7 +107,6 @@ impl ModuleInfo { const EXPECTED_KEYS: &[&str] = &[ "type", "name", - "author", "authors", "description", "license", @@ -134,7 +132,6 @@ impl ModuleInfo { match key.as_str() { "type" => info.type_ = expect_ident(it), "name" => info.name = expect_string_ascii(it), - "author" => info.author = Some(expect_string(it)), "authors" => info.authors = Some(expect_string_array(it)), "description" => info.description = Some(expect_string(it)), "license" => info.license = expect_string_ascii(it), @@ -179,9 +176,6 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream { // Rust does not allow hyphens in identifiers, use underscore instead. let ident = info.name.replace('-', "_"); let mut modinfo = ModInfoBuilder::new(ident.as_ref()); - if let Some(author) = info.author { - modinfo.emit("author", &author); - } if let Some(authors) = info.authors { for author in authors { modinfo.emit("author", &author); diff --git a/rust/pin-init/README.md b/rust/pin-init/README.md index 2d0cda961d45..a4c01a8d78b2 100644 --- a/rust/pin-init/README.md +++ b/rust/pin-init/README.md @@ -125,7 +125,7 @@ impl DriverData { fn new() -> impl PinInit<Self, Error> { try_pin_init!(Self { status <- CMutex::new(0), - buffer: Box::init(pin_init::zeroed())?, + buffer: Box::init(pin_init::init_zeroed())?, }? 
Error) } } diff --git a/rust/pin-init/examples/big_struct_in_place.rs b/rust/pin-init/examples/big_struct_in_place.rs index 30d44a334ffd..c05139927486 100644 --- a/rust/pin-init/examples/big_struct_in_place.rs +++ b/rust/pin-init/examples/big_struct_in_place.rs @@ -4,6 +4,7 @@ use pin_init::*; // Struct with size over 1GiB #[derive(Debug)] +#[allow(dead_code)] pub struct BigStruct { buf: [u8; 1024 * 1024 * 1024], a: u64, @@ -20,20 +21,23 @@ pub struct ManagedBuf { impl ManagedBuf { pub fn new() -> impl Init<Self> { - init!(ManagedBuf { buf <- zeroed() }) + init!(ManagedBuf { buf <- init_zeroed() }) } } fn main() { - // we want to initialize the struct in-place, otherwise we would get a stackoverflow - let buf: Box<BigStruct> = Box::init(init!(BigStruct { - buf <- zeroed(), - a: 7, - b: 186, - c: 7789, - d: 34, - managed_buf <- ManagedBuf::new(), - })) - .unwrap(); - println!("{}", core::mem::size_of_val(&*buf)); + #[cfg(any(feature = "std", feature = "alloc"))] + { + // we want to initialize the struct in-place, otherwise we would get a stackoverflow + let buf: Box<BigStruct> = Box::init(init!(BigStruct { + buf <- init_zeroed(), + a: 7, + b: 186, + c: 7789, + d: 34, + managed_buf <- ManagedBuf::new(), + })) + .unwrap(); + println!("{}", core::mem::size_of_val(&*buf)); + } } diff --git a/rust/pin-init/examples/linked_list.rs b/rust/pin-init/examples/linked_list.rs index 0bbc7b8d83a1..f9e117c7dfe0 100644 --- a/rust/pin-init/examples/linked_list.rs +++ b/rust/pin-init/examples/linked_list.rs @@ -14,8 +14,9 @@ use core::{ use pin_init::*; -#[expect(unused_attributes)] +#[allow(unused_attributes)] mod error; +#[allow(unused_imports)] use error::Error; #[pin_data(PinnedDrop)] @@ -39,6 +40,7 @@ impl ListHead { } #[inline] + #[allow(dead_code)] pub fn insert_next(list: &ListHead) -> impl PinInit<Self, Infallible> + '_ { try_pin_init!(&this in Self { prev: list.next.prev().replace(unsafe { Link::new_unchecked(this)}), @@ -112,6 +114,7 @@ impl Link { } #[inline] + #[allow(dead_code)] fn prev(&self) -> &Link { unsafe { &(*self.0.get().as_ptr()).prev } } @@ -138,7 +141,12 @@ impl Link { } #[allow(dead_code)] +#[cfg(not(any(feature = "std", feature = "alloc")))] +fn main() {} + +#[allow(dead_code)] #[cfg_attr(test, test)] +#[cfg(any(feature = "std", feature = "alloc"))] fn main() -> Result<(), Error> { let a = Box::pin_init(ListHead::new())?; stack_pin_init!(let b = ListHead::insert_next(&a)); diff --git a/rust/pin-init/examples/mutex.rs b/rust/pin-init/examples/mutex.rs index 3e3630780c96..9f295226cd64 100644 --- a/rust/pin-init/examples/mutex.rs +++ b/rust/pin-init/examples/mutex.rs @@ -12,14 +12,15 @@ use core::{ pin::Pin, sync::atomic::{AtomicBool, Ordering}, }; +#[cfg(feature = "std")] use std::{ sync::Arc, - thread::{self, park, sleep, Builder, Thread}, + thread::{self, sleep, Builder, Thread}, time::Duration, }; use pin_init::*; -#[expect(unused_attributes)] +#[allow(unused_attributes)] #[path = "./linked_list.rs"] pub mod linked_list; use linked_list::*; @@ -36,6 +37,7 @@ impl SpinLock { .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) .is_err() { + #[cfg(feature = "std")] while self.inner.load(Ordering::Relaxed) { thread::yield_now(); } @@ -94,7 +96,8 @@ impl<T> CMutex<T> { // println!("wait list length: {}", self.wait_list.size()); while self.locked.get() { drop(sguard); - park(); + #[cfg(feature = "std")] + thread::park(); sguard = self.spin_lock.acquire(); } // This does have an effect, as the ListHead inside wait_entry implements Drop! 
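The recurring change across these pin-init example files is the rename of `zeroed()` to
`init_zeroed()`. A minimal sketch of the renamed API (the `Scratch` struct below is
hypothetical; `init!`, `Init` and `init_zeroed` are the pin-init items used throughout this
patch):

    use pin_init::{init, Init, init_zeroed};

    // Hypothetical struct: a large buffer we want zero-initialized in place
    // rather than built on the stack and copied into its allocation.
    struct Scratch {
        buf: [u8; 1024 * 1024],
    }

    impl Scratch {
        fn new() -> impl Init<Self> {
            // `init_zeroed()` yields an initializer that writes zeroes
            // directly into the destination slot, so a caller such as
            // `Box::init(Scratch::new())` never places the array on the stack.
            init!(Scratch {
                buf <- init_zeroed(),
            })
        }
    }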
@@ -131,8 +134,11 @@ impl<T> Drop for CMutexGuard<'_, T> { let sguard = self.mtx.spin_lock.acquire(); self.mtx.locked.set(false); if let Some(list_field) = self.mtx.wait_list.next() { - let wait_entry = list_field.as_ptr().cast::<WaitEntry>(); - unsafe { (*wait_entry).thread.unpark() }; + let _wait_entry = list_field.as_ptr().cast::<WaitEntry>(); + #[cfg(feature = "std")] + unsafe { + (*_wait_entry).thread.unpark() + }; } drop(sguard); } @@ -159,52 +165,61 @@ impl<T> DerefMut for CMutexGuard<'_, T> { struct WaitEntry { #[pin] wait_list: ListHead, + #[cfg(feature = "std")] thread: Thread, } impl WaitEntry { #[inline] fn insert_new(list: &ListHead) -> impl PinInit<Self> + '_ { - pin_init!(Self { - thread: thread::current(), - wait_list <- ListHead::insert_prev(list), - }) + #[cfg(feature = "std")] + { + pin_init!(Self { + thread: thread::current(), + wait_list <- ListHead::insert_prev(list), + }) + } + #[cfg(not(feature = "std"))] + { + pin_init!(Self { + wait_list <- ListHead::insert_prev(list), + }) + } } } -#[cfg(not(any(feature = "std", feature = "alloc")))] -fn main() {} - -#[allow(dead_code)] #[cfg_attr(test, test)] -#[cfg(any(feature = "std", feature = "alloc"))] +#[allow(dead_code)] fn main() { - let mtx: Pin<Arc<CMutex<usize>>> = Arc::pin_init(CMutex::new(0)).unwrap(); - let mut handles = vec![]; - let thread_count = 20; - let workload = if cfg!(miri) { 100 } else { 1_000 }; - for i in 0..thread_count { - let mtx = mtx.clone(); - handles.push( - Builder::new() - .name(format!("worker #{i}")) - .spawn(move || { - for _ in 0..workload { - *mtx.lock() += 1; - } - println!("{i} halfway"); - sleep(Duration::from_millis((i as u64) * 10)); - for _ in 0..workload { - *mtx.lock() += 1; - } - println!("{i} finished"); - }) - .expect("should not fail"), - ); - } - for h in handles { - h.join().expect("thread panicked"); + #[cfg(feature = "std")] + { + let mtx: Pin<Arc<CMutex<usize>>> = Arc::pin_init(CMutex::new(0)).unwrap(); + let mut handles = vec![]; + let thread_count = 20; + let workload = if cfg!(miri) { 100 } else { 1_000 }; + for i in 0..thread_count { + let mtx = mtx.clone(); + handles.push( + Builder::new() + .name(format!("worker #{i}")) + .spawn(move || { + for _ in 0..workload { + *mtx.lock() += 1; + } + println!("{i} halfway"); + sleep(Duration::from_millis((i as u64) * 10)); + for _ in 0..workload { + *mtx.lock() += 1; + } + println!("{i} finished"); + }) + .expect("should not fail"), + ); + } + for h in handles { + h.join().expect("thread panicked"); + } + println!("{:?}", &*mtx.lock()); + assert_eq!(*mtx.lock(), workload * thread_count * 2); } - println!("{:?}", &*mtx.lock()); - assert_eq!(*mtx.lock(), workload * thread_count * 2); } diff --git a/rust/pin-init/examples/pthread_mutex.rs b/rust/pin-init/examples/pthread_mutex.rs index 5acc5108b954..49b004c8c137 100644 --- a/rust/pin-init/examples/pthread_mutex.rs +++ b/rust/pin-init/examples/pthread_mutex.rs @@ -44,6 +44,7 @@ mod pthread_mtx { pub enum Error { #[allow(dead_code)] IO(std::io::Error), + #[allow(dead_code)] Alloc, } @@ -61,6 +62,7 @@ mod pthread_mtx { } impl<T> PThreadMutex<T> { + #[allow(dead_code)] pub fn new(data: T) -> impl PinInit<Self, Error> { fn init_raw() -> impl PinInit<UnsafeCell<libc::pthread_mutex_t>, Error> { let init = |slot: *mut UnsafeCell<libc::pthread_mutex_t>| { @@ -103,6 +105,7 @@ mod pthread_mtx { }? 
Error) } + #[allow(dead_code)] pub fn lock(&self) -> PThreadMutexGuard<'_, T> { // SAFETY: raw is always initialized unsafe { libc::pthread_mutex_lock(self.raw.get()) }; @@ -137,6 +140,7 @@ mod pthread_mtx { } #[cfg_attr(test, test)] +#[cfg_attr(all(test, miri), ignore)] fn main() { #[cfg(all(any(feature = "std", feature = "alloc"), not(windows)))] { diff --git a/rust/pin-init/examples/static_init.rs b/rust/pin-init/examples/static_init.rs index 48531413ab94..0e165daa9798 100644 --- a/rust/pin-init/examples/static_init.rs +++ b/rust/pin-init/examples/static_init.rs @@ -3,6 +3,7 @@ #![allow(clippy::undocumented_unsafe_blocks)] #![cfg_attr(feature = "alloc", feature(allocator_api))] #![cfg_attr(not(RUSTC_LINT_REASONS_IS_STABLE), feature(lint_reasons))] +#![allow(unused_imports)] use core::{ cell::{Cell, UnsafeCell}, @@ -12,12 +13,13 @@ use core::{ time::Duration, }; use pin_init::*; +#[cfg(feature = "std")] use std::{ sync::Arc, thread::{sleep, Builder}, }; -#[expect(unused_attributes)] +#[allow(unused_attributes)] mod mutex; use mutex::*; @@ -82,42 +84,41 @@ unsafe impl PinInit<CMutex<usize>> for CountInit { pub static COUNT: StaticInit<CMutex<usize>, CountInit> = StaticInit::new(CountInit); -#[cfg(not(any(feature = "std", feature = "alloc")))] -fn main() {} - -#[cfg(any(feature = "std", feature = "alloc"))] fn main() { - let mtx: Pin<Arc<CMutex<usize>>> = Arc::pin_init(CMutex::new(0)).unwrap(); - let mut handles = vec![]; - let thread_count = 20; - let workload = 1_000; - for i in 0..thread_count { - let mtx = mtx.clone(); - handles.push( - Builder::new() - .name(format!("worker #{i}")) - .spawn(move || { - for _ in 0..workload { - *COUNT.lock() += 1; - std::thread::sleep(std::time::Duration::from_millis(10)); - *mtx.lock() += 1; - std::thread::sleep(std::time::Duration::from_millis(10)); - *COUNT.lock() += 1; - } - println!("{i} halfway"); - sleep(Duration::from_millis((i as u64) * 10)); - for _ in 0..workload { - std::thread::sleep(std::time::Duration::from_millis(10)); - *mtx.lock() += 1; - } - println!("{i} finished"); - }) - .expect("should not fail"), - ); - } - for h in handles { - h.join().expect("thread panicked"); + #[cfg(feature = "std")] + { + let mtx: Pin<Arc<CMutex<usize>>> = Arc::pin_init(CMutex::new(0)).unwrap(); + let mut handles = vec![]; + let thread_count = 20; + let workload = 1_000; + for i in 0..thread_count { + let mtx = mtx.clone(); + handles.push( + Builder::new() + .name(format!("worker #{i}")) + .spawn(move || { + for _ in 0..workload { + *COUNT.lock() += 1; + std::thread::sleep(std::time::Duration::from_millis(10)); + *mtx.lock() += 1; + std::thread::sleep(std::time::Duration::from_millis(10)); + *COUNT.lock() += 1; + } + println!("{i} halfway"); + sleep(Duration::from_millis((i as u64) * 10)); + for _ in 0..workload { + std::thread::sleep(std::time::Duration::from_millis(10)); + *mtx.lock() += 1; + } + println!("{i} finished"); + }) + .expect("should not fail"), + ); + } + for h in handles { + h.join().expect("thread panicked"); + } + println!("{:?}, {:?}", &*mtx.lock(), &*COUNT.lock()); + assert_eq!(*mtx.lock(), workload * thread_count * 2); } - println!("{:?}, {:?}", &*mtx.lock(), &*COUNT.lock()); - assert_eq!(*mtx.lock(), workload * thread_count * 2); } diff --git a/rust/pin-init/src/__internal.rs b/rust/pin-init/src/__internal.rs index 557b5948cddc..90f18e9a2912 100644 --- a/rust/pin-init/src/__internal.rs +++ b/rust/pin-init/src/__internal.rs @@ -188,6 +188,7 @@ impl<T> StackInit<T> { } #[test] +#[cfg(feature = "std")] fn stack_init_reuse() { use 
::std::{borrow::ToOwned, println, string::String}; use core::pin::pin; diff --git a/rust/pin-init/src/lib.rs b/rust/pin-init/src/lib.rs index f4e034497cdd..62e013a5cc20 100644 --- a/rust/pin-init/src/lib.rs +++ b/rust/pin-init/src/lib.rs @@ -148,7 +148,7 @@ //! fn new() -> impl PinInit<Self, Error> { //! try_pin_init!(Self { //! status <- CMutex::new(0), -//! buffer: Box::init(pin_init::zeroed())?, +//! buffer: Box::init(pin_init::init_zeroed())?, //! }? Error) //! } //! } @@ -742,7 +742,7 @@ macro_rules! stack_try_pin_init { /// - Fields that you want to initialize in-place have to use `<-` instead of `:`. /// - In front of the initializer you can write `&this in` to have access to a [`NonNull<Self>`] /// pointer named `this` inside of the initializer. -/// - Using struct update syntax one can place `..Zeroable::zeroed()` at the very end of the +/// - Using struct update syntax one can place `..Zeroable::init_zeroed()` at the very end of the /// struct, this initializes every field with 0 and then runs all initializers specified in the /// body. This can only be done if [`Zeroable`] is implemented for the struct. /// @@ -769,7 +769,7 @@ macro_rules! stack_try_pin_init { /// }); /// let init = pin_init!(Buf { /// buf: [1; 64], -/// ..Zeroable::zeroed() +/// ..Zeroable::init_zeroed() /// }); /// ``` /// @@ -805,7 +805,7 @@ macro_rules! pin_init { /// ```rust /// # #![feature(allocator_api)] /// # #[path = "../examples/error.rs"] mod error; use error::Error; -/// use pin_init::{pin_data, try_pin_init, PinInit, InPlaceInit, zeroed}; +/// use pin_init::{pin_data, try_pin_init, PinInit, InPlaceInit, init_zeroed}; /// /// #[pin_data] /// struct BigBuf { @@ -817,7 +817,7 @@ macro_rules! pin_init { /// impl BigBuf { /// fn new() -> impl PinInit<Self, Error> { /// try_pin_init!(Self { -/// big: Box::init(zeroed())?, +/// big: Box::init(init_zeroed())?, /// small: [0; 1024 * 1024], /// ptr: core::ptr::null_mut(), /// }? Error) @@ -866,7 +866,7 @@ macro_rules! try_pin_init { /// # #[path = "../examples/error.rs"] mod error; use error::Error; /// # #[path = "../examples/mutex.rs"] mod mutex; use mutex::*; /// # use pin_init::InPlaceInit; -/// use pin_init::{init, Init, zeroed}; +/// use pin_init::{init, Init, init_zeroed}; /// /// struct BigBuf { /// small: [u8; 1024 * 1024], @@ -875,7 +875,7 @@ macro_rules! try_pin_init { /// impl BigBuf { /// fn new() -> impl Init<Self> { /// init!(Self { -/// small <- zeroed(), +/// small <- init_zeroed(), /// }) /// } /// } @@ -913,7 +913,7 @@ macro_rules! init { /// # #![feature(allocator_api)] /// # use core::alloc::AllocError; /// # use pin_init::InPlaceInit; -/// use pin_init::{try_init, Init, zeroed}; +/// use pin_init::{try_init, Init, init_zeroed}; /// /// struct BigBuf { /// big: Box<[u8; 1024 * 1024 * 1024]>, @@ -923,7 +923,7 @@ macro_rules! init { /// impl BigBuf { /// fn new() -> impl Init<Self, AllocError> { /// try_init!(Self { -/// big: Box::init(zeroed())?, +/// big: Box::init(init_zeroed())?, /// small: [0; 1024 * 1024], /// }? AllocError) /// } @@ -953,7 +953,7 @@ macro_rules! try_init { /// Asserts that a field on a struct using `#[pin_data]` is marked with `#[pin]` ie. that it is /// structurally pinned. 
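The doc hunks above rename `zeroed()` to `init_zeroed()` in the `{try_}{pin_}init!` examples; the struct-update form `..Zeroable::init_zeroed()` they describe zero-fills every field that is not listed explicitly. A compact sketch of that form (illustrative names; assumes the `Zeroable` derive exported by the crate):

```rust
use pin_init::*;

#[derive(Zeroable)]
struct Config {
    len: usize,
    ptr: *mut u8,
    buf: [u8; 64],
}

fn config_init() -> impl Init<Config> {
    // Only `buf` is initialized explicitly; `len` and `ptr` are zero-filled
    // by the trailing struct-update form, which requires `Config: Zeroable`.
    init!(Config {
        buf: [1; 64],
        ..Zeroable::init_zeroed()
    })
}
```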
/// -/// # Example +/// # Examples /// /// This will succeed: /// ``` @@ -1170,7 +1170,7 @@ pub unsafe trait Init<T: ?Sized, E = Infallible>: PinInit<T, E> { /// /// ```rust /// # #![expect(clippy::disallowed_names)] - /// use pin_init::{init, zeroed, Init}; + /// use pin_init::{init, init_zeroed, Init}; /// /// struct Foo { /// buf: [u8; 1_000_000], @@ -1183,7 +1183,7 @@ pub unsafe trait Init<T: ?Sized, E = Infallible>: PinInit<T, E> { /// } /// /// let foo = init!(Foo { - /// buf <- zeroed() + /// buf <- init_zeroed() /// }).chain(|foo| { /// foo.setup(); /// Ok(()) @@ -1495,7 +1495,45 @@ pub unsafe trait PinnedDrop: __internal::HasPinData { /// ```rust,ignore /// let val: Self = unsafe { core::mem::zeroed() }; /// ``` -pub unsafe trait Zeroable {} +pub unsafe trait Zeroable { + /// Create a new zeroed `Self`. + /// + /// The returned initializer will write `0x00` to every byte of the given `slot`. + #[inline] + fn init_zeroed() -> impl Init<Self> + where + Self: Sized, + { + init_zeroed() + } + + /// Create a `Self` consisting of all zeroes. + /// + /// Whenever a type implements [`Zeroable`], this function should be preferred over + /// [`core::mem::zeroed()`] or using `MaybeUninit<T>::zeroed().assume_init()`. + /// + /// # Examples + /// + /// ``` + /// use pin_init::{Zeroable, zeroed}; + /// + /// #[derive(Zeroable)] + /// struct Point { + /// x: u32, + /// y: u32, + /// } + /// + /// let point: Point = zeroed(); + /// assert_eq!(point.x, 0); + /// assert_eq!(point.y, 0); + /// ``` + fn zeroed() -> Self + where + Self: Sized, + { + zeroed() + } +} /// Marker trait for types that allow `Option<Self>` to be set to all zeroes in order to write /// `None` to that location. @@ -1508,11 +1546,21 @@ pub unsafe trait ZeroableOption {} // SAFETY: by the safety requirement of `ZeroableOption`, this is valid. unsafe impl<T: ZeroableOption> Zeroable for Option<T> {} -/// Create a new zeroed T. +// SAFETY: `Option<&T>` is part of the option layout optimization guarantee: +// <https://doc.rust-lang.org/stable/std/option/index.html#representation>. +unsafe impl<T> ZeroableOption for &T {} +// SAFETY: `Option<&mut T>` is part of the option layout optimization guarantee: +// <https://doc.rust-lang.org/stable/std/option/index.html#representation>. +unsafe impl<T> ZeroableOption for &mut T {} +// SAFETY: `Option<NonNull<T>>` is part of the option layout optimization guarantee: +// <https://doc.rust-lang.org/stable/std/option/index.html#representation>. +unsafe impl<T> ZeroableOption for NonNull<T> {} + +/// Create an initializer for a zeroed `T`. /// /// The returned initializer will write `0x00` to every byte of the given `slot`. #[inline] -pub fn zeroed<T: Zeroable>() -> impl Init<T> { +pub fn init_zeroed<T: Zeroable>() -> impl Init<T> { // SAFETY: Because `T: Zeroable`, all bytes zero is a valid bit pattern for `T` // and because we write all zeroes, the memory is initialized. unsafe { @@ -1523,6 +1571,31 @@ pub fn zeroed<T: Zeroable>() -> impl Init<T> { } } +/// Create a `T` consisting of all zeroes. +/// +/// Whenever a type implements [`Zeroable`], this function should be preferred over +/// [`core::mem::zeroed()`] or using `MaybeUninit<T>::zeroed().assume_init()`. 
+/// +/// # Examples +/// +/// ``` +/// use pin_init::{Zeroable, zeroed}; +/// +/// #[derive(Zeroable)] +/// struct Point { +/// x: u32, +/// y: u32, +/// } +/// +/// let point: Point = zeroed(); +/// assert_eq!(point.x, 0); +/// assert_eq!(point.y, 0); +/// ``` +pub const fn zeroed<T: Zeroable>() -> T { + // SAFETY: By the safety requirement of `Zeroable`, all zeroes is a valid bit pattern for `T`. + unsafe { core::mem::zeroed() } +} + macro_rules! impl_zeroable { ($($({$($generics:tt)*})? $t:ty, )*) => { // SAFETY: Safety comments written in the macro invocation. @@ -1560,7 +1633,6 @@ impl_zeroable! { Option<NonZeroU128>, Option<NonZeroUsize>, Option<NonZeroI8>, Option<NonZeroI16>, Option<NonZeroI32>, Option<NonZeroI64>, Option<NonZeroI128>, Option<NonZeroIsize>, - {<T>} Option<NonNull<T>>, // SAFETY: `null` pointer is valid. // @@ -1590,6 +1662,22 @@ macro_rules! impl_tuple_zeroable { impl_tuple_zeroable!(A, B, C, D, E, F, G, H, I, J); +macro_rules! impl_fn_zeroable_option { + ([$($abi:literal),* $(,)?] $args:tt) => { + $(impl_fn_zeroable_option!({extern $abi} $args);)* + $(impl_fn_zeroable_option!({unsafe extern $abi} $args);)* + }; + ({$($prefix:tt)*} {$(,)?}) => {}; + ({$($prefix:tt)*} {$ret:ident, $($rest:ident),* $(,)?}) => { + // SAFETY: function pointers are part of the option layout optimization: + // <https://doc.rust-lang.org/stable/std/option/index.html#representation>. + unsafe impl<$ret, $($rest),*> ZeroableOption for $($prefix)* fn($($rest),*) -> $ret {} + impl_fn_zeroable_option!({$($prefix)*} {$($rest),*,}); + }; +} + +impl_fn_zeroable_option!(["Rust", "C"] { A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U }); + /// This trait allows creating an instance of `Self` which contains exactly one /// [structurally pinned value](https://doc.rust-lang.org/std/pin/index.html#projections-and-structural-pinning). /// diff --git a/rust/pin-init/src/macros.rs b/rust/pin-init/src/macros.rs index 935d77745d1d..9ced630737b8 100644 --- a/rust/pin-init/src/macros.rs +++ b/rust/pin-init/src/macros.rs @@ -1030,7 +1030,7 @@ macro_rules! __pin_data { /// /// This macro has multiple internal call configurations, these are always the very first ident: /// - nothing: this is the base case and called by the `{try_}{pin_}init!` macros. -/// - `with_update_parsed`: when the `..Zeroable::zeroed()` syntax has been handled. +/// - `with_update_parsed`: when the `..Zeroable::init_zeroed()` syntax has been handled. /// - `init_slot`: recursively creates the code that initializes all fields in `slot`. /// - `make_initializer`: recursively create the struct initializer that guarantees that every /// field has been initialized exactly once. @@ -1059,7 +1059,7 @@ macro_rules! __init_internal { @data($data, $($use_data)?), @has_data($has_data, $get_data), @construct_closure($construct_closure), - @zeroed(), // Nothing means default behavior. + @init_zeroed(), // Nothing means default behavior. ) }; ( @@ -1074,7 +1074,7 @@ macro_rules! __init_internal { @has_data($has_data:ident, $get_data:ident), // `pin_init_from_closure` or `init_from_closure`. @construct_closure($construct_closure:ident), - @munch_fields(..Zeroable::zeroed()), + @munch_fields(..Zeroable::init_zeroed()), ) => { $crate::__init_internal!(with_update_parsed: @this($($this)?), @typ($t), @fields_munch($fields), @data($data, $($use_data)?), @has_data($has_data, $get_data), @construct_closure($construct_closure), - @zeroed(()), // `()` means zero all fields not mentioned.
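The new `ZeroableOption` impls above (shared and mutable references, `NonNull<T>`, and the function-pointer family generated by `impl_fn_zeroable_option!`) combine with the blanket `impl<T: ZeroableOption> Zeroable for Option<T>` so that such options can be built with the public `zeroed()`: the option layout (niche) optimization guarantees the all-zero bit pattern is exactly `None`. A small illustrative sketch, not part of the patch:

```rust
use core::ptr::NonNull;
use pin_init::zeroed;

fn none_by_zeroing() {
    // Each payload type has a niche at the null/zero value, so
    // `Option<_>: Zeroable` holds for all three of these.
    let r: Option<&'static u64> = zeroed();
    let p: Option<NonNull<u8>> = zeroed();
    let f: Option<extern "C" fn(i32) -> i32> = zeroed();
    assert!(r.is_none() && p.is_none() && f.is_none());
}
```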
+ @init_zeroed(()), // `()` means zero all fields not mentioned. ) }; ( @@ -1124,7 +1124,7 @@ macro_rules! __init_internal { @has_data($has_data:ident, $get_data:ident), // `pin_init_from_closure` or `init_from_closure`. @construct_closure($construct_closure:ident), - @zeroed($($init_zeroed:expr)?), + @init_zeroed($($init_zeroed:expr)?), ) => {{ // We do not want to allow arbitrary returns, so we declare this type as the `Ok` return // type and shadow it later when we insert the arbitrary user code. That way there will be @@ -1196,7 +1196,7 @@ macro_rules! __init_internal { @data($data:ident), @slot($slot:ident), @guards($($guards:ident,)*), - @munch_fields($(..Zeroable::zeroed())? $(,)?), + @munch_fields($(..Zeroable::init_zeroed())? $(,)?), ) => { // Endpoint of munching, no fields are left. If execution reaches this point, all fields // have been initialized. Therefore we can now dismiss the guards by forgetting them. @@ -1300,11 +1300,11 @@ macro_rules! __init_internal { (make_initializer: @slot($slot:ident), @type_name($t:path), - @munch_fields(..Zeroable::zeroed() $(,)?), + @munch_fields(..Zeroable::init_zeroed() $(,)?), @acc($($acc:tt)*), ) => { // Endpoint, nothing more to munch, create the initializer. Since the users specified - // `..Zeroable::zeroed()`, the slot will already have been zeroed and all field that have + // `..Zeroable::init_zeroed()`, the slot will already have been zeroed and all field that have // not been overwritten are thus zero and initialized. We still check that all fields are // actually accessible by using the struct update syntax ourselves. // We are inside of a closure that is never executed and thus we can abuse `slot` to diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs index c98d7a8cde77..31c2f713313f 100644 --- a/rust/uapi/lib.rs +++ b/rust/uapi/lib.rs @@ -14,6 +14,9 @@ #![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))] #![allow( clippy::all, + clippy::cast_lossless, + clippy::ptr_as_ptr, + clippy::ref_as_ptr, clippy::undocumented_unsafe_blocks, dead_code, missing_docs, diff --git a/samples/Kconfig b/samples/Kconfig index ffef99950206..6e072a5f1ed8 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -54,7 +54,7 @@ config SAMPLE_FTRACE_OPS measures the time taken to invoke one function a number of times. config SAMPLE_TRACE_ARRAY - tristate "Build sample module for kernel access to Ftrace instancess" + tristate "Build sample module for kernel access to Ftrace instances" depends on EVENT_TRACING && m help This builds a module that demonstrates the use of various APIs to @@ -316,10 +316,9 @@ config SAMPLE_HUNG_TASK depends on DETECT_HUNG_TASK && DEBUG_FS help Build a module that provides debugfs files (e.g., mutex, semaphore, - etc.) under <debugfs>/hung_task. If user reads one of these files, - it will sleep long time (256 seconds) with holding a lock. Thus, - if 2 or more processes read the same file concurrently, it will - be detected by the hung_task watchdog. + rw_semaphore_read, rw_semaphore_write) under <debugfs>/hung_task. + Reading these files with multiple processes triggers hung task + detection by holding locks for a long time (256 seconds). source "samples/rust/Kconfig" diff --git a/samples/hung_task/hung_task_tests.c b/samples/hung_task/hung_task_tests.c index a5c09bd3a47d..0360ec916890 100644 --- a/samples/hung_task/hung_task_tests.c +++ b/samples/hung_task/hung_task_tests.c @@ -4,11 +4,12 @@ * semaphore, etc. 
* * Usage: Load this module and read `<debugfs>/hung_task/mutex`, - * `<debugfs>/hung_task/semaphore`, etc., with 2 or more processes. + * `<debugfs>/hung_task/semaphore`, `<debugfs>/hung_task/rw_semaphore_read`, + * `<debugfs>/hung_task/rw_semaphore_write`, etc., with 2 or more processes. * * This is for testing kernel hung_task error messages with various locking - * mechanisms (e.g., mutex, semaphore, etc.). Note that this may freeze - * your system or cause a panic. Use only for testing purposes. + * mechanisms (e.g., mutex, semaphore, rw_semaphore_read, rw_semaphore_write, etc.). + * Note that this may freeze your system or cause a panic. Use only for testing purposes. */ #include <linux/debugfs.h> @@ -17,21 +18,29 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/semaphore.h> +#include <linux/rwsem.h> -#define HUNG_TASK_DIR "hung_task" -#define HUNG_TASK_MUTEX_FILE "mutex" -#define HUNG_TASK_SEM_FILE "semaphore" -#define SLEEP_SECOND 256 +#define HUNG_TASK_DIR "hung_task" +#define HUNG_TASK_MUTEX_FILE "mutex" +#define HUNG_TASK_SEM_FILE "semaphore" +#define HUNG_TASK_RWSEM_READ_FILE "rw_semaphore_read" +#define HUNG_TASK_RWSEM_WRITE_FILE "rw_semaphore_write" +#define SLEEP_SECOND 256 static const char dummy_string[] = "This is a dummy string."; static DEFINE_MUTEX(dummy_mutex); static DEFINE_SEMAPHORE(dummy_sem, 1); +static DECLARE_RWSEM(dummy_rwsem); static struct dentry *hung_task_dir; /* Mutex-based read function */ static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + /* Second task waits on mutex, entering uninterruptible sleep */ guard(mutex)(&dummy_mutex); @@ -46,6 +55,10 @@ static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf, static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + /* Second task waits on semaphore, entering uninterruptible sleep */ down(&dummy_sem); @@ -58,6 +71,46 @@ static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf, sizeof(dummy_string)); } +/* Read-write semaphore read function */ +static ssize_t read_dummy_rwsem_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + + /* Acquires read lock, allowing concurrent readers but blocks if write lock is held */ + down_read(&dummy_rwsem); + + /* Sleeps here, potentially triggering hung task detection if lock is held too long */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up_read(&dummy_rwsem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* Read-write semaphore write function */ +static ssize_t read_dummy_rwsem_write(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + + /* Acquires exclusive write lock, blocking all other readers and writers */ + down_write(&dummy_rwsem); + + /* Sleeps here, potentially triggering hung task detection if lock is held too long */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up_write(&dummy_rwsem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + /* File operations for mutex */ static const struct 
file_operations hung_task_mutex_fops = { .read = read_dummy_mutex, @@ -68,6 +121,16 @@ static const struct file_operations hung_task_sem_fops = { .read = read_dummy_semaphore, }; +/* File operations for rw_semaphore read */ +static const struct file_operations hung_task_rwsem_read_fops = { + .read = read_dummy_rwsem_read, +}; + +/* File operations for rw_semaphore write */ +static const struct file_operations hung_task_rwsem_write_fops = { + .read = read_dummy_rwsem_write, +}; + static int __init hung_task_tests_init(void) { hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); @@ -79,6 +142,10 @@ static int __init hung_task_tests_init(void) &hung_task_mutex_fops); debugfs_create_file(HUNG_TASK_SEM_FILE, 0400, hung_task_dir, NULL, &hung_task_sem_fops); + debugfs_create_file(HUNG_TASK_RWSEM_READ_FILE, 0400, hung_task_dir, NULL, + &hung_task_rwsem_read_fops); + debugfs_create_file(HUNG_TASK_RWSEM_WRITE_FILE, 0400, hung_task_dir, NULL, + &hung_task_rwsem_write_fops); return 0; } diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs index 60ddbe62cda3..af04bfa35cb2 100644 --- a/samples/rust/rust_configfs.rs +++ b/samples/rust/rust_configfs.rs @@ -14,7 +14,7 @@ use kernel::sync::Mutex; module! { type: RustConfigfs, name: "rust_configfs", - author: "Rust for Linux Contributors", + authors: ["Rust for Linux Contributors"], description: "Rust configfs sample", license: "GPL", } diff --git a/samples/rust/rust_driver_auxiliary.rs b/samples/rust/rust_driver_auxiliary.rs index b25628604a93..f2a820683fc3 100644 --- a/samples/rust/rust_driver_auxiliary.rs +++ b/samples/rust/rust_driver_auxiliary.rs @@ -113,7 +113,7 @@ impl InPlaceModule for SampleModule { module! { type: SampleModule, name: "rust_driver_auxiliary", - author: "Danilo Krummrich", + authors: ["Danilo Krummrich"], description: "Rust auxiliary driver", license: "GPL v2", } diff --git a/samples/rust/rust_misc_device.rs b/samples/rust/rust_misc_device.rs index c881fd6dbd08..e7ab77448f75 100644 --- a/samples/rust/rust_misc_device.rs +++ b/samples/rust/rust_misc_device.rs @@ -176,6 +176,8 @@ impl MiscDevice for RustMiscDevice { fn ioctl(me: Pin<&RustMiscDevice>, _file: &File, cmd: u32, arg: usize) -> Result<isize> { dev_info!(me.dev, "IOCTLing Rust Misc Device Sample\n"); + // Treat the ioctl argument as a user pointer. + let arg = UserPtr::from_addr(arg); let size = _IOC_SIZE(cmd); match cmd { diff --git a/samples/rust/rust_print_main.rs b/samples/rust/rust_print_main.rs index 8ea95e8c2f36..4095c72afeab 100644 --- a/samples/rust/rust_print_main.rs +++ b/samples/rust/rust_print_main.rs @@ -40,7 +40,7 @@ fn arc_print() -> Result { // behaviour, contract or protocol on both `i32` and `&str` into a single `Arc` of // type `Arc<dyn Display>`. - use core::fmt::Display; + use kernel::fmt::Display; fn arc_dyn_print(arc: &Arc<dyn Display>) { pr_info!("Arc<dyn Display> says {arc}"); } diff --git a/scripts/Makefile.build b/scripts/Makefile.build index ba71b27aa363..d0ee33a487be 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -309,14 +309,15 @@ $(obj)/%.lst: $(obj)/%.c FORCE # The features in this list are the ones allowed for non-`rust/` code. # # - Stable since Rust 1.81.0: `feature(lint_reasons)`. -# - Stable since Rust 1.82.0: `feature(asm_const)`, `feature(raw_ref_op)`. +# - Stable since Rust 1.82.0: `feature(asm_const)`, +# `feature(offset_of_nested)`, `feature(raw_ref_op)`. # - Stable since Rust 1.87.0: `feature(asm_goto)`. # - Expected to become stable: `feature(arbitrary_self_types)`. 
# - To be determined: `feature(used_with_arg)`. # # Please see https://github.com/Rust-for-Linux/linux/issues/2 for details on # the unstable features in use. -rust_allowed_features := asm_const,asm_goto,arbitrary_self_types,lint_reasons,raw_ref_op,used_with_arg +rust_allowed_features := asm_const,asm_goto,arbitrary_self_types,lint_reasons,offset_of_nested,raw_ref_op,used_with_arg # `--out-dir` is required to avoid temporaries being created by `rustc` in the # current working directory, which may be not accessible in the out-of-tree diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 22a6de59b77b..e722dd6fa8ef 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -685,6 +685,9 @@ our $tracing_logging_tags = qr{(?xi: [\.\!:\s]* )}; +# Device ID types like those found in include/linux/mod_devicetable.h. +our $dev_id_types = qr{\b[a-z]\w*_device_id\b}; + sub edit_distance_min { my (@arr) = @_; my $len = scalar @arr; @@ -3500,9 +3503,10 @@ sub process { # Check for various typo / spelling mistakes if (defined($misspellings) && ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) { - while ($rawline =~ /(?:^|[^\w\-'`])($misspellings)(?:[^\w\-'`]|$)/gi) { + my $rawline_utf8 = decode("utf8", $rawline); + while ($rawline_utf8 =~ /(?:^|[^\w\-'`])($misspellings)(?:[^\w\-'`]|$)/gi) { my $typo = $1; - my $blank = copy_spacing($rawline); + my $blank = copy_spacing($rawline_utf8); my $ptr = substr($blank, 0, $-[1]) . "^" x length($typo); my $hereptr = "$hereline$ptr\n"; my $typo_fix = $spelling_fix{lc($typo)}; @@ -7688,6 +7692,31 @@ sub process { WARN("DUPLICATED_SYSCTL_CONST", "duplicated sysctl range checking value '$1', consider using the shared one in include/linux/sysctl.h\n" . $herecurr); } + +# Check that *_device_id tables have sentinel entries. + if (defined $stat && $line =~ /struct\s+$dev_id_types\s+\w+\s*\[\s*\]\s*=\s*\{/) { + my $stripped = $stat; + + # Strip diff line prefixes. + $stripped =~ s/(^|\n)./$1/g; + # Line continuations. + $stripped =~ s/\\\n/\n/g; + # Strip whitespace, empty strings, zeroes, and commas. + $stripped =~ s/""//g; + $stripped =~ s/0x0//g; + $stripped =~ s/[\s$;,0]//g; + # Strip field assignments. + $stripped =~ s/\.$Ident=//g; + + if (!(substr($stripped, -4) eq "{}};" || + substr($stripped, -6) eq "{{}}};" || + $stripped =~ /ISAPNP_DEVICE_SINGLE_END}};$/ || + $stripped =~ /ISAPNP_CARD_END}};$/ || + $stripped =~ /NULL};$/ || + $stripped =~ /PCMCIA_DEVICE_NULL};$/)) { + ERROR("MISSING_SENTINEL", "missing sentinel in ID array\n" .
"$here\n$stat\n"); + } + } } # If we have no input at all, then there is nothing to report on diff --git a/scripts/coccinelle/misc/secs_to_jiffies.cocci b/scripts/coccinelle/misc/secs_to_jiffies.cocci index 416f348174ca..f3241ce75a7b 100644 --- a/scripts/coccinelle/misc/secs_to_jiffies.cocci +++ b/scripts/coccinelle/misc/secs_to_jiffies.cocci @@ -7,26 +7,65 @@ // Confidence: High // Copyright: (C) 2024 Easwar Hariharan, Microsoft // Keywords: secs, seconds, jiffies -// +// Options: --include-headers virtual patch +virtual report +virtual context -@depends on patch@ constant C; @@ +@pconst depends on patch@ constant C; @@ - msecs_to_jiffies(C * 1000) + secs_to_jiffies(C) -@depends on patch@ constant C; @@ +@pconstms depends on patch@ constant C; @@ - msecs_to_jiffies(C * MSEC_PER_SEC) + secs_to_jiffies(C) -@depends on patch@ expression E; @@ +@pexpr depends on patch@ expression E; @@ - msecs_to_jiffies(E * 1000) + secs_to_jiffies(E) -@depends on patch@ expression E; @@ +@pexprms depends on patch@ expression E; @@ - msecs_to_jiffies(E * MSEC_PER_SEC) + secs_to_jiffies(E) + +@r depends on report && !patch@ +constant C; +expression E; +position p; +@@ + +( + msecs_to_jiffies(C@p * 1000) +| + msecs_to_jiffies(C@p * MSEC_PER_SEC) +| + msecs_to_jiffies(E@p * 1000) +| + msecs_to_jiffies(E@p * MSEC_PER_SEC) +) + +@c depends on context && !patch@ +constant C; +expression E; +@@ + +( +* msecs_to_jiffies(C * 1000) +| +* msecs_to_jiffies(C * MSEC_PER_SEC) +| +* msecs_to_jiffies(E * 1000) +| +* msecs_to_jiffies(E * MSEC_PER_SEC) +) + +@script:python depends on report@ +p << r.p; +@@ + +coccilib.report.print_report(p[0], "WARNING opportunity for secs_to_jiffies()") diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index f795302ddfa8..c3886739a028 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -74,12 +74,12 @@ if IS_BUILTIN(CONFIG_MODULES): LX_GDBPARSED(MOD_RO_AFTER_INIT) /* linux/mount.h */ -LX_VALUE(MNT_NOSUID) -LX_VALUE(MNT_NODEV) -LX_VALUE(MNT_NOEXEC) -LX_VALUE(MNT_NOATIME) -LX_VALUE(MNT_NODIRATIME) -LX_VALUE(MNT_RELATIME) +LX_GDBPARSED(MNT_NOSUID) +LX_GDBPARSED(MNT_NODEV) +LX_GDBPARSED(MNT_NOEXEC) +LX_GDBPARSED(MNT_NOATIME) +LX_GDBPARSED(MNT_NODIRATIME) +LX_GDBPARSED(MNT_RELATIME) /* linux/threads.h */ LX_VALUE(NR_CPUS) diff --git a/scripts/rustdoc_test_gen.rs b/scripts/rustdoc_test_gen.rs index 1ca253594d38..abb34ada2508 100644 --- a/scripts/rustdoc_test_gen.rs +++ b/scripts/rustdoc_test_gen.rs @@ -85,24 +85,25 @@ fn find_real_path<'a>(srctree: &Path, valid_paths: &'a mut Vec<PathBuf>, file: & } } - assert!( - valid_paths.len() > 0, - "No path candidates found for `{file}`. This is likely a bug in the build system, or some \ - files went away while compiling." - ); - - if valid_paths.len() > 1 { - eprintln!("Several path candidates found:"); - for path in valid_paths { - eprintln!(" {path:?}"); + match valid_paths.as_slice() { + [] => panic!( + "No path candidates found for `{file}`. This is likely a bug in the build system, or \ + some files went away while compiling." + ), + [valid_path] => valid_path.to_str().unwrap(), + valid_paths => { + use std::fmt::Write; + + let mut candidates = String::new(); + for path in valid_paths { + writeln!(&mut candidates, " {path:?}").unwrap(); + } + panic!( + "Several path candidates found for `{file}`, please resolve the ambiguity by \ + renaming a file or folder. 
Candidates:\n{candidates}", + ); } - panic!( - "Several path candidates found for `{file}`, please resolve the ambiguity by renaming \ - a file or folder." - ); } - - valid_paths[0].to_str().unwrap() } fn main() { diff --git a/scripts/spelling.txt b/scripts/spelling.txt index ac94fa1c2415..1e89b92c2f9a 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1099,6 +1099,7 @@ notication||notification notications||notifications notifcations||notifications notifed||notified +notifer||notifier notity||notify notfify||notify nubmer||number diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index b9c5879dd599..12fb419714c0 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o task.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ resource.o secid.o file.o policy_ns.o label.o mount.o net.o \ - policy_compat.o + policy_compat.o af_unix.o apparmor-$(CONFIG_SECURITY_APPARMOR_HASH) += crypto.o obj-$(CONFIG_SECURITY_APPARMOR_KUNIT_TEST) += apparmor_policy_unpack_test.o @@ -28,7 +28,7 @@ clean-files := capability_names.h rlim_names.h net_names.h # to # #define AA_SFS_AF_MASK "local inet" quiet_cmd_make-af = GEN $@ -cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ +cmd_make-af = echo "static const char *const address_family_names[] = {" > $@ ;\ sed $< >>$@ -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e "/AF_ROUTE/d" -e \ 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ echo "};" >> $@ ;\ @@ -43,7 +43,7 @@ cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ # to # [1] = "stream", quiet_cmd_make-sock = GEN $@ -cmd_make-sock = echo "static const char *sock_type_names[] = {" >> $@ ;\ +cmd_make-sock = echo "static const char *const sock_type_names[] = {" >> $@ ;\ sed $^ >>$@ -r -n \ -e 's/^\tSOCK_([A-Z0-9_]+)[\t]+=[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ echo "};" >> $@ diff --git a/security/apparmor/af_unix.c b/security/apparmor/af_unix.c new file mode 100644 index 000000000000..9129766d1e9c --- /dev/null +++ b/security/apparmor/af_unix.c @@ -0,0 +1,799 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AppArmor security module + * + * This file contains AppArmor af_unix fine grained mediation + * + * Copyright 2023 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#include <linux/fs.h> +#include <net/tcp_states.h> + +#include "include/audit.h" +#include "include/af_unix.h" +#include "include/apparmor.h" +#include "include/file.h" +#include "include/label.h" +#include "include/path.h" +#include "include/policy.h" +#include "include/cred.h" + + +static inline struct sock *aa_unix_sk(struct unix_sock *u) +{ + return &u->sk; +} + +static int unix_fs_perm(const char *op, u32 mask, const struct cred *subj_cred, + struct aa_label *label, struct path *path) +{ + AA_BUG(!label); + AA_BUG(!path); + + if (unconfined(label) || !label_mediates(label, AA_CLASS_FILE)) + return 0; + + mask &= NET_FS_PERMS; + /* if !u->path.dentry socket is being shutdown - implicit delegation + * until obj delegation is supported + */ + if (path->dentry) { + /* the sunpath may not be valid for this ns so use the path */ + struct inode *inode = path->dentry->d_inode; + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), inode); + struct path_cond cond = { + .uid = vfsuid_into_kuid(vfsuid), + .mode = inode->i_mode, + }; + + return aa_path_perm(op, subj_cred, label, path, + PATH_SOCK_COND, mask, &cond); + } /* else implicitly delegated */ + + return 0; +} + +/* match_addr special constants */ +#define ABSTRACT_ADDR "\x00" /* abstract socket addr */ +#define ANONYMOUS_ADDR "\x01" /* anonymous endpoint, no addr */ +#define DISCONNECTED_ADDR "\x02" /* addr is another namespace */ +#define SHUTDOWN_ADDR "\x03" /* path addr is shutdown and cleared */ +#define FS_ADDR "/" /* path addr in fs */ + +static aa_state_t match_addr(struct aa_dfa *dfa, aa_state_t state, + struct sockaddr_un *addr, int addrlen) +{ + if (addr) + /* include leading \0 */ + state = aa_dfa_match_len(dfa, state, addr->sun_path, + unix_addr_len(addrlen)); + else + state = aa_dfa_match_len(dfa, state, ANONYMOUS_ADDR, 1); + /* todo: could change to out of band for cleaner separation */ + state = aa_dfa_null_transition(dfa, state); + + return state; +} + +static aa_state_t match_to_local(struct aa_policydb *policy, + aa_state_t state, u32 request, + int type, int protocol, + struct sockaddr_un *addr, int addrlen, + struct aa_perms **p, + const char **info) +{ + state = aa_match_to_prot(policy, state, request, PF_UNIX, type, + protocol, NULL, info); + if (state) { + state = match_addr(policy->dfa, state, addr, addrlen); + if (state) { + /* todo: local label matching */ + state = aa_dfa_null_transition(policy->dfa, state); + if (!state) + *info = "failed local label match"; + } else { + *info = "failed local address match"; + } + } + + return state; +} + +struct sockaddr_un *aa_sunaddr(const struct unix_sock *u, int *addrlen) +{ + struct unix_address *addr; + + /* memory barrier is sufficient see note in net/unix/af_unix.c */ + addr = smp_load_acquire(&u->addr); + if (addr) { + *addrlen = addr->len; + return addr->name; + } + *addrlen = 0; + return NULL; +} + +static aa_state_t match_to_sk(struct aa_policydb *policy, + aa_state_t state, u32 request, + struct unix_sock *u, struct aa_perms **p, + const char **info) +{ + int addrlen; + struct sockaddr_un *addr = aa_sunaddr(u, &addrlen); + + return match_to_local(policy, state, request, u->sk.sk_type, + u->sk.sk_protocol, addr, addrlen, p, info); +} + +#define CMD_ADDR 1 +#define CMD_LISTEN 2 +#define CMD_OPT 4 + +static aa_state_t match_to_cmd(struct aa_policydb *policy, aa_state_t state, + u32 request, struct unix_sock *u, + char cmd, struct aa_perms **p, + const char **info) +{ + AA_BUG(!p); + + state = match_to_sk(policy, state, request, u, p, info); + if (state 
&& !*p) { + state = aa_dfa_match_len(policy->dfa, state, &cmd, 1); + if (!state) + *info = "failed cmd selection match"; + } + + return state; +} + +static aa_state_t match_to_peer(struct aa_policydb *policy, aa_state_t state, + u32 request, struct unix_sock *u, + struct sockaddr_un *peer_addr, int peer_addrlen, + struct aa_perms **p, const char **info) +{ + AA_BUG(!p); + + state = match_to_cmd(policy, state, request, u, CMD_ADDR, p, info); + if (state && !*p) { + state = match_addr(policy->dfa, state, peer_addr, peer_addrlen); + if (!state) + *info = "failed peer address match"; + } + + return state; +} + +static aa_state_t match_label(struct aa_profile *profile, + struct aa_ruleset *rule, aa_state_t state, + u32 request, struct aa_profile *peer, + struct aa_perms *p, + struct apparmor_audit_data *ad) +{ + AA_BUG(!profile); + AA_BUG(!peer); + + ad->peer = &peer->label; + + if (state && !p) { + state = aa_dfa_match(rule->policy->dfa, state, + peer->base.hname); + if (!state) + ad->info = "failed peer label match"; + + } + + return aa_do_perms(profile, rule->policy, state, request, p, ad); +} + + +/* unix sock creation comes before we know if the socket will be an fs + * socket + * v6 - semantics are handled by mapping in profile load + * v7 - semantics require sock create for tasks creating an fs socket. + * v8 - same as v7 + */ +static int profile_create_perm(struct aa_profile *profile, int family, + int type, int protocol, + struct apparmor_audit_data *ad) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(profile_unconfined(profile)); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + state = aa_match_to_prot(rules->policy, state, AA_MAY_CREATE, + PF_UNIX, type, protocol, NULL, + &ad->info); + + return aa_do_perms(profile, rules->policy, state, AA_MAY_CREATE, + NULL, ad); + } + + return aa_profile_af_perm(profile, ad, AA_MAY_CREATE, family, type, + protocol); +} + +static int profile_sk_perm(struct aa_profile *profile, + struct apparmor_audit_data *ad, + u32 request, struct sock *sk, struct path *path) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(!sk); + AA_BUG(profile_unconfined(profile)); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + if (is_unix_fs(sk)) + return unix_fs_perm(ad->op, request, ad->subj_cred, + &profile->label, + &unix_sk(sk)->path); + + state = match_to_sk(rules->policy, state, request, unix_sk(sk), + &p, &ad->info); + + return aa_do_perms(profile, rules->policy, state, request, p, + ad); + } + + return aa_profile_af_sk_perm(profile, ad, request, sk); +} + +static int profile_bind_perm(struct aa_profile *profile, struct sock *sk, + struct apparmor_audit_data *ad) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(!sk); + AA_BUG(!ad); + AA_BUG(profile_unconfined(profile)); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + if (is_unix_addr_fs(ad->net.addr, ad->net.addrlen)) + /* under v7-9 fs hook handles bind */ + return 0; + /* bind for abstract socket */ + state = match_to_local(rules->policy, state, AA_MAY_BIND, + sk->sk_type, sk->sk_protocol, + unix_addr(ad->net.addr), + ad->net.addrlen, + &p, &ad->info); + + return aa_do_perms(profile, rules->policy, state, AA_MAY_BIND, + p, ad); + } + + return aa_profile_af_sk_perm(profile, ad, AA_MAY_BIND, sk); +} + +static int profile_listen_perm(struct aa_profile *profile, 
struct sock *sk, + int backlog, struct apparmor_audit_data *ad) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(!sk); + AA_BUG(!ad); + AA_BUG(profile_unconfined(profile)); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + __be16 b = cpu_to_be16(backlog); + + if (is_unix_fs(sk)) + return unix_fs_perm(ad->op, AA_MAY_LISTEN, + ad->subj_cred, &profile->label, + &unix_sk(sk)->path); + + state = match_to_cmd(rules->policy, state, AA_MAY_LISTEN, + unix_sk(sk), CMD_LISTEN, &p, &ad->info); + if (state && !p) { + state = aa_dfa_match_len(rules->policy->dfa, state, + (char *) &b, 2); + if (!state) + ad->info = "failed listen backlog match"; + } + return aa_do_perms(profile, rules->policy, state, AA_MAY_LISTEN, + p, ad); + } + + return aa_profile_af_sk_perm(profile, ad, AA_MAY_LISTEN, sk); +} + +static int profile_accept_perm(struct aa_profile *profile, + struct sock *sk, + struct apparmor_audit_data *ad) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(!sk); + AA_BUG(!ad); + AA_BUG(profile_unconfined(profile)); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + if (is_unix_fs(sk)) + return unix_fs_perm(ad->op, AA_MAY_ACCEPT, + ad->subj_cred, &profile->label, + &unix_sk(sk)->path); + + state = match_to_sk(rules->policy, state, AA_MAY_ACCEPT, + unix_sk(sk), &p, &ad->info); + + return aa_do_perms(profile, rules->policy, state, AA_MAY_ACCEPT, + p, ad); + } + + return aa_profile_af_sk_perm(profile, ad, AA_MAY_ACCEPT, sk); +} + +static int profile_opt_perm(struct aa_profile *profile, u32 request, + struct sock *sk, int optname, + struct apparmor_audit_data *ad) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(!sk); + AA_BUG(!ad); + AA_BUG(profile_unconfined(profile)); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + __be16 b = cpu_to_be16(optname); + if (is_unix_fs(sk)) + return unix_fs_perm(ad->op, request, + ad->subj_cred, &profile->label, + &unix_sk(sk)->path); + + state = match_to_cmd(rules->policy, state, request, unix_sk(sk), + CMD_OPT, &p, &ad->info); + if (state && !p) { + state = aa_dfa_match_len(rules->policy->dfa, state, + (char *) &b, 2); + if (!state) + ad->info = "failed sockopt match"; + } + return aa_do_perms(profile, rules->policy, state, request, p, + ad); + } + + return aa_profile_af_sk_perm(profile, ad, request, sk); +} + +/* null peer_label is allowed, in which case the peer_sk label is used */ +static int profile_peer_perm(struct aa_profile *profile, u32 request, + struct sock *sk, struct path *path, + struct sockaddr_un *peer_addr, + int peer_addrlen, struct path *peer_path, + struct aa_label *peer_label, + struct apparmor_audit_data *ad) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; + aa_state_t state; + + AA_BUG(!profile); + AA_BUG(profile_unconfined(profile)); + AA_BUG(!sk); + AA_BUG(!peer_label); + AA_BUG(!ad); + + state = RULE_MEDIATES_v9NET(rules); + if (state) { + struct aa_profile *peerp; + + if (peer_path) + return unix_fs_perm(ad->op, request, ad->subj_cred, + &profile->label, peer_path); + else if (path) + return unix_fs_perm(ad->op, request, ad->subj_cred, + &profile->label, path); + state = match_to_peer(rules->policy, state, request, + unix_sk(sk), + peer_addr, peer_addrlen, &p, &ad->info); + + return fn_for_each_in_ns(peer_label, peerp, + 
match_label(profile, rules, state, request, + peerp, p, ad)); + } + + return aa_profile_af_sk_perm(profile, ad, request, sk); +} + +/* -------------------------------- */ + +int aa_unix_create_perm(struct aa_label *label, int family, int type, + int protocol) +{ + if (!unconfined(label)) { + struct aa_profile *profile; + DEFINE_AUDIT_NET(ad, OP_CREATE, current_cred(), NULL, family, + type, protocol); + + return fn_for_each_confined(label, profile, + profile_create_perm(profile, family, type, + protocol, &ad)); + } + + return 0; +} + +static int aa_unix_label_sk_perm(const struct cred *subj_cred, + struct aa_label *label, + const char *op, u32 request, struct sock *sk, + struct path *path) +{ + if (!unconfined(label)) { + struct aa_profile *profile; + DEFINE_AUDIT_SK(ad, op, subj_cred, sk); + + return fn_for_each_confined(label, profile, + profile_sk_perm(profile, &ad, request, sk, + path)); + } + return 0; +} + +/* revalidation, get/set attr, shutdown */ +int aa_unix_sock_perm(const char *op, u32 request, struct socket *sock) +{ + struct aa_label *label; + int error; + + label = begin_current_label_crit_section(); + error = aa_unix_label_sk_perm(current_cred(), label, op, + request, sock->sk, + is_unix_fs(sock->sk) ? &unix_sk(sock->sk)->path : NULL); + end_current_label_crit_section(label); + + return error; +} + +static int valid_addr(struct sockaddr *addr, int addr_len) +{ + struct sockaddr_un *sunaddr = unix_addr(addr); + + /* addr_len == offsetof(struct sockaddr_un, sun_path) is autobind */ + if (addr_len < offsetof(struct sockaddr_un, sun_path) || + addr_len > sizeof(*sunaddr)) + return -EINVAL; + return 0; +} + +int aa_unix_bind_perm(struct socket *sock, struct sockaddr *addr, + int addrlen) +{ + struct aa_profile *profile; + struct aa_label *label; + int error = 0; + + error = valid_addr(addr, addrlen); + if (error) + return error; + + label = begin_current_label_crit_section(); + /* fs bind is handled by mknod */ + if (!unconfined(label)) { + DEFINE_AUDIT_SK(ad, OP_BIND, current_cred(), sock->sk); + + ad.net.addr = unix_addr(addr); + ad.net.addrlen = addrlen; + + error = fn_for_each_confined(label, profile, + profile_bind_perm(profile, sock->sk, &ad)); + } + end_current_label_crit_section(label); + + return error; +} + +/* + * unix connections are covered by the + * - unix_stream_connect (stream) and unix_may_send hooks (dgram) + * - fs connect is handled by open + * This is just here to document this is not needed for af_unix + * +int aa_unix_connect_perm(struct socket *sock, struct sockaddr *address, + int addrlen) +{ + return 0; +} +*/ + +int aa_unix_listen_perm(struct socket *sock, int backlog) +{ + struct aa_profile *profile; + struct aa_label *label; + int error = 0; + + label = begin_current_label_crit_section(); + if (!unconfined(label)) { + DEFINE_AUDIT_SK(ad, OP_LISTEN, current_cred(), sock->sk); + + error = fn_for_each_confined(label, profile, + profile_listen_perm(profile, sock->sk, + backlog, &ad)); + } + end_current_label_crit_section(label); + + return error; +} + + +/* ability of sock to connect, not peer address binding */ +int aa_unix_accept_perm(struct socket *sock, struct socket *newsock) +{ + struct aa_profile *profile; + struct aa_label *label; + int error = 0; + + label = begin_current_label_crit_section(); + if (!unconfined(label)) { + DEFINE_AUDIT_SK(ad, OP_ACCEPT, current_cred(), sock->sk); + + error = fn_for_each_confined(label, profile, + profile_accept_perm(profile, sock->sk, &ad)); + } + end_current_label_crit_section(label); + + return error; +} + + 
+/* + * dgram handled by unix_may_sendmsg, right to send on stream done at connect + * could do per msg unix_stream here, but connect + socket transfer is + * sufficient. This is just here to document this is not needed for af_unix + * + * sendmsg, recvmsg +int aa_unix_msg_perm(const char *op, u32 request, struct socket *sock, + struct msghdr *msg, int size) +{ + return 0; +} +*/ + +int aa_unix_opt_perm(const char *op, u32 request, struct socket *sock, + int level, int optname) +{ + struct aa_profile *profile; + struct aa_label *label; + int error = 0; + + label = begin_current_label_crit_section(); + if (!unconfined(label)) { + DEFINE_AUDIT_SK(ad, op, current_cred(), sock->sk); + + error = fn_for_each_confined(label, profile, + profile_opt_perm(profile, request, sock->sk, + optname, &ad)); + } + end_current_label_crit_section(label); + + return error; +} + +static int unix_peer_perm(const struct cred *subj_cred, + struct aa_label *label, const char *op, u32 request, + struct sock *sk, struct path *path, + struct sockaddr_un *peer_addr, int peer_addrlen, + struct path *peer_path, struct aa_label *peer_label) +{ + struct aa_profile *profile; + DEFINE_AUDIT_SK(ad, op, subj_cred, sk); + + ad.net.peer.addr = peer_addr; + ad.net.peer.addrlen = peer_addrlen; + + return fn_for_each_confined(label, profile, + profile_peer_perm(profile, request, sk, path, + peer_addr, peer_addrlen, peer_path, + peer_label, &ad)); +} + +/** + * + * Requires: lock held on both @sk and @peer_sk + * called by unix_stream_connect, unix_may_send + */ +int aa_unix_peer_perm(const struct cred *subj_cred, + struct aa_label *label, const char *op, u32 request, + struct sock *sk, struct sock *peer_sk, + struct aa_label *peer_label) +{ + struct unix_sock *peeru = unix_sk(peer_sk); + struct unix_sock *u = unix_sk(sk); + int plen; + struct sockaddr_un *paddr = aa_sunaddr(unix_sk(peer_sk), &plen); + + AA_BUG(!label); + AA_BUG(!sk); + AA_BUG(!peer_sk); + AA_BUG(!peer_label); + + return unix_peer_perm(subj_cred, label, op, request, sk, + is_unix_fs(sk) ? &u->path : NULL, + paddr, plen, + is_unix_fs(peer_sk) ? 
&peeru->path : NULL, + peer_label); +} + +/* sk_plabel for comparison only */ +static void update_sk_ctx(struct sock *sk, struct aa_label *label, + struct aa_label *plabel) +{ + struct aa_label *l, *old; + struct aa_sk_ctx *ctx = aa_sock(sk); + bool update_sk; + + rcu_read_lock(); + update_sk = (plabel && + (plabel != rcu_access_pointer(ctx->peer_lastupdate) || + !aa_label_is_subset(plabel, rcu_dereference(ctx->peer)))) || + !__aa_subj_label_is_cached(label, rcu_dereference(ctx->label)); + rcu_read_unlock(); + if (!update_sk) + return; + + spin_lock(&unix_sk(sk)->lock); + old = rcu_dereference_protected(ctx->label, + lockdep_is_held(&unix_sk(sk)->lock)); + l = aa_label_merge(old, label, GFP_ATOMIC); + if (l) { + if (l != old) { + rcu_assign_pointer(ctx->label, l); + aa_put_label(old); + } else + aa_put_label(l); + } + if (plabel && rcu_access_pointer(ctx->peer_lastupdate) != plabel) { + old = rcu_dereference_protected(ctx->peer, lockdep_is_held(&unix_sk(sk)->lock)); + + if (old == plabel) { + rcu_assign_pointer(ctx->peer_lastupdate, plabel); + } else if (aa_label_is_subset(plabel, old)) { + rcu_assign_pointer(ctx->peer_lastupdate, plabel); + rcu_assign_pointer(ctx->peer, aa_get_label(plabel)); + aa_put_label(old); + } /* else race or a subset - don't update */ + } + spin_unlock(&unix_sk(sk)->lock); +} + +static void update_peer_ctx(struct sock *sk, struct aa_sk_ctx *ctx, + struct aa_label *label) +{ + struct aa_label *l, *old; + + spin_lock(&unix_sk(sk)->lock); + old = rcu_dereference_protected(ctx->peer, + lockdep_is_held(&unix_sk(sk)->lock)); + l = aa_label_merge(old, label, GFP_ATOMIC); + if (l) { + if (l != old) { + rcu_assign_pointer(ctx->peer, l); + aa_put_label(old); + } else + aa_put_label(l); + } + spin_unlock(&unix_sk(sk)->lock); +} + +/* This fn is only checked if something has changed in the security + * boundaries. Otherwise cached info off file is sufficient + */ +int aa_unix_file_perm(const struct cred *subj_cred, struct aa_label *label, + const char *op, u32 request, struct file *file) +{ + struct socket *sock = (struct socket *) file->private_data; + struct sockaddr_un *addr, *peer_addr; + int addrlen, peer_addrlen; + struct aa_label *plabel = NULL; + struct sock *peer_sk = NULL; + u32 sk_req = request & ~NET_PEER_MASK; + struct path path; + bool is_sk_fs; + int error = 0; + + AA_BUG(!label); + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(sock->sk->sk_family != PF_UNIX); + + /* investigate only using lock via unix_peer_get() + * addr only needs the memory barrier, but need to investigate + * path + */ + unix_state_lock(sock->sk); + peer_sk = unix_peer(sock->sk); + if (peer_sk) + sock_hold(peer_sk); + + is_sk_fs = is_unix_fs(sock->sk); + addr = aa_sunaddr(unix_sk(sock->sk), &addrlen); + path = unix_sk(sock->sk)->path; + unix_state_unlock(sock->sk); + + if (is_sk_fs && peer_sk) + sk_req = request; + if (sk_req) { + error = aa_unix_label_sk_perm(subj_cred, label, op, + sk_req, sock->sk, + is_sk_fs ? &path : NULL); + } + if (!peer_sk) + goto out; + + peer_addr = aa_sunaddr(unix_sk(peer_sk), &peer_addrlen); + + struct path peer_path; + + peer_path = unix_sk(peer_sk)->path; + if (!is_sk_fs && is_unix_fs(peer_sk)) { + last_error(error, + unix_fs_perm(op, request, subj_cred, label, + is_unix_fs(peer_sk) ? 
&peer_path : NULL)); + } else if (!is_sk_fs) { + struct aa_label *plabel; + struct aa_sk_ctx *pctx = aa_sock(peer_sk); + + rcu_read_lock(); + plabel = aa_get_label_rcu(&pctx->label); + rcu_read_unlock(); + /* no fs check of aa_unix_peer_perm because conditions above + * ensure they will never be done + */ + last_error(error, + xcheck(unix_peer_perm(subj_cred, label, op, + MAY_READ | MAY_WRITE, sock->sk, + is_sk_fs ? &path : NULL, + peer_addr, peer_addrlen, + is_unix_fs(peer_sk) ? + &peer_path : NULL, + plabel), + unix_peer_perm(file->f_cred, plabel, op, + MAY_READ | MAY_WRITE, peer_sk, + is_unix_fs(peer_sk) ? + &peer_path : NULL, + addr, addrlen, + is_sk_fs ? &path : NULL, + label))); + if (!error && !__aa_subj_label_is_cached(plabel, label)) + update_peer_ctx(peer_sk, pctx, label); + } + sock_put(peer_sk); + +out: + + /* update peer cache to latest successful perm check */ + if (error == 0) + update_sk_ctx(sock->sk, label, plabel); + aa_put_label(plabel); + + return error; +} + diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 0aef34b9609b..391a586d0557 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -43,7 +43,7 @@ * The interface is split into two main components based on their function * a securityfs component: * used for static files that are always available, and which allows - * userspace to specificy the location of the security filesystem. + * userspace to specify the location of the security filesystem. * * fns and data are prefixed with * aa_sfs_ @@ -204,7 +204,7 @@ static struct file_system_type aafs_ops = { /** * __aafs_setup_d_inode - basic inode setup for apparmorfs * @dir: parent directory for the dentry - * @dentry: dentry we are seting the inode up for + * @dentry: dentry we are setting the inode up for * @mode: permissions the file should have * @data: data to store on inode.i_private, available in open() * @link: if symlink, symlink target string @@ -612,8 +612,7 @@ static const struct file_operations aa_fs_ns_revision_fops = { static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, const char *match_str, size_t match_len) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms tmp = { }; aa_state_t state = DFA_NOMATCH; @@ -626,11 +625,20 @@ static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, if (state) { struct path_cond cond = { }; - tmp = *(aa_lookup_fperms(rules->file, state, &cond)); + tmp = *(aa_lookup_condperms(current_fsuid(), + rules->file, state, &cond)); } } else if (rules->policy->dfa) { if (!RULE_MEDIATES(rules, *match_str)) return; /* no change to current perms */ + /* old user space does not correctly detect dbus mediation + * support so we may get dbus policy and requests when + * the abi doesn't support it. This can cause mediation + * regressions, so explicitly test for this situation. + */ + if (*match_str == AA_CLASS_DBUS && + !RULE_MEDIATES_v9NET(rules)) + return; /* no change to current perms */ state = aa_dfa_match_len(rules->policy->dfa, rules->policy->start[0], match_str, match_len); @@ -997,7 +1005,7 @@ static int aa_sfs_seq_show(struct seq_file *seq, void *v) switch (fs_file->v_type) { case AA_SFS_TYPE_BOOLEAN: - seq_printf(seq, "%s\n", fs_file->v.boolean ? 
"yes" : "no"); + seq_printf(seq, "%s\n", str_yes_no(fs_file->v.boolean)); break; case AA_SFS_TYPE_STRING: seq_printf(seq, "%s\n", fs_file->v.string); @@ -1006,7 +1014,7 @@ static int aa_sfs_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%#08lx\n", fs_file->v.u64); break; default: - /* Ignore unpritable entry types. */ + /* Ignore unprintable entry types. */ break; } @@ -1152,7 +1160,7 @@ static int seq_ns_stacked_show(struct seq_file *seq, void *v) struct aa_label *label; label = begin_current_label_crit_section(); - seq_printf(seq, "%s\n", label->size > 1 ? "yes" : "no"); + seq_printf(seq, "%s\n", str_yes_no(label->size > 1)); end_current_label_crit_section(label); return 0; @@ -1175,7 +1183,7 @@ static int seq_ns_nsstacked_show(struct seq_file *seq, void *v) } } - seq_printf(seq, "%s\n", count > 1 ? "yes" : "no"); + seq_printf(seq, "%s\n", str_yes_no(count > 1)); end_current_label_crit_section(label); return 0; @@ -2244,7 +2252,7 @@ static void *p_next(struct seq_file *f, void *p, loff_t *pos) /** * p_stop - stop depth first traversal * @f: seq_file we are filling - * @p: the last profile writen + * @p: the last profile written * * Release all locking done by p_start/p_next on namespace tree */ @@ -2332,6 +2340,7 @@ static struct aa_sfs_entry aa_sfs_entry_attach[] = { static struct aa_sfs_entry aa_sfs_entry_domain[] = { AA_SFS_FILE_BOOLEAN("change_hat", 1), AA_SFS_FILE_BOOLEAN("change_hatv", 1), + AA_SFS_FILE_BOOLEAN("unconfined_allowed_children", 1), AA_SFS_FILE_BOOLEAN("change_onexec", 1), AA_SFS_FILE_BOOLEAN("change_profile", 1), AA_SFS_FILE_BOOLEAN("stack", 1), @@ -2340,6 +2349,7 @@ static struct aa_sfs_entry aa_sfs_entry_domain[] = { AA_SFS_FILE_BOOLEAN("computed_longest_left", 1), AA_SFS_DIR("attach_conditions", aa_sfs_entry_attach), AA_SFS_FILE_BOOLEAN("disconnected.path", 1), + AA_SFS_FILE_BOOLEAN("kill.signal", 1), AA_SFS_FILE_STRING("version", "1.2"), { } }; @@ -2364,7 +2374,7 @@ static struct aa_sfs_entry aa_sfs_entry_policy[] = { AA_SFS_FILE_BOOLEAN("set_load", 1), /* number of out of band transitions supported */ AA_SFS_FILE_U64("outofband", MAX_OOB_SUPPORTED), - AA_SFS_FILE_U64("permstable32_version", 1), + AA_SFS_FILE_U64("permstable32_version", 3), AA_SFS_FILE_STRING("permstable32", PERMS32STR), AA_SFS_FILE_U64("state32", 1), AA_SFS_DIR("unconfined_restrictions", aa_sfs_entry_unconfined), @@ -2384,6 +2394,11 @@ static struct aa_sfs_entry aa_sfs_entry_ns[] = { { } }; +static struct aa_sfs_entry aa_sfs_entry_dbus[] = { + AA_SFS_FILE_STRING("mask", "acquire send receive"), + { } +}; + static struct aa_sfs_entry aa_sfs_entry_query_label[] = { AA_SFS_FILE_STRING("perms", "allow deny audit quiet"), AA_SFS_FILE_BOOLEAN("data", 1), @@ -2406,6 +2421,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("domain", aa_sfs_entry_domain), AA_SFS_DIR("file", aa_sfs_entry_file), AA_SFS_DIR("network_v8", aa_sfs_entry_network), + AA_SFS_DIR("network_v9", aa_sfs_entry_networkv9), AA_SFS_DIR("mount", aa_sfs_entry_mount), AA_SFS_DIR("namespaces", aa_sfs_entry_ns), AA_SFS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), @@ -2413,6 +2429,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("caps", aa_sfs_entry_caps), AA_SFS_DIR("ptrace", aa_sfs_entry_ptrace), AA_SFS_DIR("signal", aa_sfs_entry_signal), + AA_SFS_DIR("dbus", aa_sfs_entry_dbus), AA_SFS_DIR("query", aa_sfs_entry_query), AA_SFS_DIR("io_uring", aa_sfs_entry_io_uring), { } diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 73087d76f649..ac89602aa2d9 100644 --- 
a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -192,7 +192,7 @@ int aa_audit(int type, struct aa_profile *profile, aa_audit_msg(type, ad, cb); if (ad->type == AUDIT_APPARMOR_KILL) - (void)send_sig_info(SIGKILL, NULL, + (void)send_sig_info(profile->signal, NULL, ad->common.type == LSM_AUDIT_DATA_TASK && ad->common.u.tsk ? ad->common.u.tsk : current); diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c index 7ca489ee1054..b9ea6bc45c1a 100644 --- a/security/apparmor/capability.c +++ b/security/apparmor/capability.c @@ -27,6 +27,7 @@ struct aa_sfs_entry aa_sfs_entry_caps[] = { AA_SFS_FILE_STRING("mask", AA_SFS_CAPS_MASK), + AA_SFS_FILE_BOOLEAN("extended", 1), { } }; @@ -68,8 +69,7 @@ static int audit_caps(struct apparmor_audit_data *ad, struct aa_profile *profile { const u64 AUDIT_CACHE_TIMEOUT_NS = 1000*1000*1000; /* 1 second */ - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct audit_cache *ent; int type = AUDIT_APPARMOR_AUTO; @@ -121,10 +121,32 @@ static int audit_caps(struct apparmor_audit_data *ad, struct aa_profile *profile static int profile_capable(struct aa_profile *profile, int cap, unsigned int opts, struct apparmor_audit_data *ad) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; + aa_state_t state; int error; + state = RULE_MEDIATES(rules, ad->class); + if (state) { + struct aa_perms perms = { }; + u32 request; + + /* caps broken into 256 x 32 bit permission chunks */ + state = aa_dfa_next(rules->policy->dfa, state, cap >> 5); + request = 1 << (cap & 0x1f); + perms = *aa_lookup_perms(rules->policy, state); + aa_apply_modes_to_perms(profile, &perms); + + if (opts & CAP_OPT_NOAUDIT) { + if (perms.complain & request) + ad->info = "optional: no audit"; + else + ad = NULL; + } + return aa_check_perms(profile, &perms, request, ad, + audit_cb); + } + + /* fallback to old caps mediation that doesn't support conditionals */ if (cap_raised(rules->caps.allow, cap) && !cap_raised(rules->caps.denied, cap)) error = 0; @@ -168,3 +190,34 @@ int aa_capable(const struct cred *subj_cred, struct aa_label *label, return error; } + +kernel_cap_t aa_profile_capget(struct aa_profile *profile) +{ + struct aa_ruleset *rules = profile->label.rules[0]; + aa_state_t state; + + state = RULE_MEDIATES(rules, AA_CLASS_CAP); + if (state) { + kernel_cap_t caps = CAP_EMPTY_SET; + int i; + + /* caps broken into up to 256, 32 bit permission chunks */ + for (i = 0; i < (CAP_LAST_CAP >> 5); i++) { + struct aa_perms perms = { }; + aa_state_t tmp; + + tmp = aa_dfa_next(rules->policy->dfa, state, i); + perms = *aa_lookup_perms(rules->policy, tmp); + aa_apply_modes_to_perms(profile, &perms); + caps.val |= ((u64)(perms.allow)) << (i * 5); + caps.val |= ((u64)(perms.complain)) << (i * 5); + } + return caps; + } + + /* fallback to old caps */ + if (COMPLAIN_MODE(profile)) + return CAP_FULL_SET; + + return rules->caps.allow; +} diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 5939bd9a9b9b..267da82afb14 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -28,6 +28,12 @@ #include "include/policy.h" #include "include/policy_ns.h" +static const char * const CONFLICTING_ATTACH_STR = "conflicting profile attachments"; +static const char * const CONFLICTING_ATTACH_STR_IX = + "conflicting profile attachments - ix fallback"; +static const char * const 
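
profile_capable() above walks capability mediation one chunk at a time: the capability number selects a 32-capability chunk, which is fed to the DFA as a single input byte, and the low five bits select the request bit inside the returned 32-bit permission word. A worked sketch of the mapping (illustrative helper, not part of the patch):

    /* illustrative: capability number -> (DFA input byte, request bit) */
    static void cap_to_chunk_bit(int cap, unsigned char *chunk, u32 *request)
    {
            *chunk = cap >> 5;             /* CAP_SYS_ADMIN (21) -> chunk 0 */
            *request = 1u << (cap & 0x1f); /* CAP_SYS_ADMIN -> 1 << 21 */
    }

A capability above 31, for example number 40, lands in chunk 1 with request bit 1 << 8, so each block of 32 capabilities gets its own accept entry in the DFA.
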
CONFLICTING_ATTACH_STR_UX = + "conflicting profile attachments - ux fallback"; + /** * may_change_ptraced_domain - check if can change profile on ptraced task * @to_cred: cred of task changing domain @@ -87,8 +93,7 @@ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_profile *tp, bool stack, aa_state_t state) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; const char *ns_name; if (stack) @@ -125,8 +130,7 @@ static int label_compound_match(struct aa_profile *profile, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_profile *tp; struct label_it i; struct path_cond cond = { }; @@ -154,7 +158,8 @@ next: if (!state) goto fail; } - *perms = *(aa_lookup_fperms(rules->file, state, &cond)); + *perms = *(aa_lookup_condperms(current_fsuid(), rules->file, state, + &cond)); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; @@ -187,8 +192,7 @@ static int label_components_match(struct aa_profile *profile, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_profile *tp; struct label_it i; struct aa_perms tmp; @@ -209,7 +213,8 @@ static int label_components_match(struct aa_profile *profile, return 0; next: - tmp = *(aa_lookup_fperms(rules->file, state, &cond)); + tmp = *(aa_lookup_condperms(current_fsuid(), rules->file, state, + &cond)); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { @@ -218,7 +223,8 @@ next: state = match_component(profile, tp, stack, start); if (!state) goto fail; - tmp = *(aa_lookup_fperms(rules->file, state, &cond)); + tmp = *(aa_lookup_condperms(current_fsuid(), rules->file, state, + &cond)); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } @@ -323,7 +329,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, size = vfs_getxattr_alloc(&nop_mnt_idmap, d, attach->xattrs[i], &value, value_size, GFP_KERNEL); if (size >= 0) { - u32 index, perm; + struct aa_perms *perms; /* * Check the xattr presence before value. This ensure @@ -335,9 +341,8 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, /* Check xattr value */ state = aa_dfa_match_len(attach->xmatch->dfa, state, value, size); - index = ACCEPT_TABLE(attach->xmatch->dfa)[state]; - perm = attach->xmatch->perms[index].allow; - if (!(perm & MAY_EXEC)) { + perms = aa_lookup_perms(attach->xmatch, state); + if (!(perms->allow & MAY_EXEC)) { ret = -EINVAL; goto out; } @@ -415,15 +420,14 @@ restart: if (attach->xmatch->dfa) { unsigned int count; aa_state_t state; - u32 index, perm; + struct aa_perms *perms; state = aa_dfa_leftmatch(attach->xmatch->dfa, attach->xmatch->start[AA_CLASS_XMATCH], name, &count); - index = ACCEPT_TABLE(attach->xmatch->dfa)[state]; - perm = attach->xmatch->perms[index].allow; + perms = aa_lookup_perms(attach->xmatch, state); /* any accepting state means a valid match. 
*/ - if (perm & MAY_EXEC) { + if (perms->allow & MAY_EXEC) { int ret = 0; if (count < candidate_len) @@ -484,7 +488,7 @@ restart: if (!candidate || conflict) { if (conflict) - *info = "conflicting profile attachments"; + *info = CONFLICTING_ATTACH_STR; rcu_read_unlock(); return NULL; } @@ -508,15 +512,16 @@ static const char *next_name(int xtype, const char *name) * @name: returns: name tested to find label (NOT NULL) * * Returns: refcounted label, or NULL on failure (MAYBE NULL) + * @name will always be set with the last name tried */ struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, const char **name) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_label *label = NULL; u32 xtype = xindex & AA_X_TYPE_MASK; int index = xindex & AA_X_INDEX_MASK; + const char *next; AA_BUG(!name); @@ -524,25 +529,27 @@ struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, /* TODO: move lookup parsing to unpack time so this is a straight * index into the resultant label */ - for (*name = rules->file->trans.table[index]; !label && *name; - *name = next_name(xtype, *name)) { + for (next = rules->file->trans.table[index]; next; + next = next_name(xtype, next)) { + const char *lookup = (*next == '&') ? next + 1 : next; + *name = next; if (xindex & AA_X_CHILD) { - struct aa_profile *new_profile; - /* release by caller */ - new_profile = aa_find_child(profile, *name); - if (new_profile) - label = &new_profile->label; + /* TODO: switch to parse to get stack of child */ + struct aa_profile *new = aa_find_child(profile, lookup); + + if (new) + /* release by caller */ + return &new->label; continue; } - label = aa_label_parse(&profile->label, *name, GFP_KERNEL, + label = aa_label_parse(&profile->label, lookup, GFP_KERNEL, true, false); - if (IS_ERR(label)) - label = NULL; + if (!IS_ERR_OR_NULL(label)) + /* release by caller */ + return label; } - /* released by caller */ - - return label; + return NULL; } /** @@ -564,12 +571,12 @@ static struct aa_label *x_to_label(struct aa_profile *profile, const char **lookupname, const char **info) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); struct aa_label *new = NULL; + struct aa_label *stack = NULL; struct aa_ns *ns = profile->ns; u32 xtype = xindex & AA_X_TYPE_MASK; - const char *stack = NULL; + /* Used for info checks during fallback handling */ + const char *old_info = NULL; switch (xtype) { case AA_X_NONE: @@ -578,13 +585,14 @@ static struct aa_label *x_to_label(struct aa_profile *profile, break; case AA_X_TABLE: /* TODO: fix when perm mapping done at unload */ - stack = rules->file->trans.table[xindex & AA_X_INDEX_MASK]; - if (*stack != '&') { - /* released by caller */ - new = x_table_lookup(profile, xindex, lookupname); - stack = NULL; + /* released by caller + * if null for both stack and direct want to try fallback + */ + new = x_table_lookup(profile, xindex, lookupname); + if (!new || **lookupname != '&') break; - } + stack = new; + new = NULL; fallthrough; /* to X_NAME */ case AA_X_NAME: if (xindex & AA_X_CHILD) @@ -599,17 +607,38 @@ static struct aa_label *x_to_label(struct aa_profile *profile, break; } + /* fallback transition check */ if (!new) { if (xindex & AA_X_INHERIT) { /* (p|c|n)ix - don't change profile but do * use the newest version */ - *info = "ix fallback"; + if (*info == CONFLICTING_ATTACH_STR) { + *info = CONFLICTING_ATTACH_STR_IX; + } else { + old_info = *info;
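
The reworked x_table_lookup() treats a leading '&' on a transition-table entry as a stacking marker: the marker is stripped before the child or label lookup, and x_to_label() keeps such a result as the stack component to be merged with the main transition target. The classification amounts to (hypothetical helper, illustrative only):

    /* illustrative: split a transition-table entry into marker + name */
    static const char *xtable_entry_name(const char *entry, bool *stacked)
    {
            *stacked = (entry[0] == '&');
            return *stacked ? entry + 1 : entry;
    }
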
+ *info = "ix fallback"; + } /* no profile && no error */ new = aa_get_newest_label(&profile->label); } else if (xindex & AA_X_UNCONFINED) { new = aa_get_newest_label(ns_unconfined(profile->ns)); - *info = "ux fallback"; + if (*info == CONFLICTING_ATTACH_STR) { + *info = CONFLICTING_ATTACH_STR_UX; + } else { + old_info = *info; + *info = "ux fallback"; + } + } + /* We set old_info on the code paths above where overwriting + * could have happened, so now check if info was set by + * find_attach as well (i.e. whether we actually overwrote) + * and warn accordingly. + */ + if (old_info && old_info != CONFLICTING_ATTACH_STR) { + pr_warn_ratelimited( + "AppArmor: find_attach (from profile %s) audit info \"%s\" dropped", + profile->base.hname, old_info); } } @@ -617,12 +646,12 @@ static struct aa_label *x_to_label(struct aa_profile *profile, /* base the stack on post domain transition */ struct aa_label *base = new; - new = aa_label_parse(base, stack, GFP_KERNEL, true, false); - if (IS_ERR(new)) - new = NULL; + new = aa_label_merge(base, stack, GFP_KERNEL); + /* null on error */ aa_put_label(base); } + aa_put_label(stack); /* released by caller */ return new; } @@ -633,8 +662,7 @@ static struct aa_label *profile_transition(const struct cred *subj_cred, char *buffer, struct path_cond *cond, bool *secure_exec) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_label *new = NULL; struct aa_profile *new_profile = NULL; const char *info = NULL, *name = NULL, *target = NULL; @@ -652,7 +680,7 @@ static struct aa_label *profile_transition(const struct cred *subj_cred, if (error) { if (profile_unconfined(profile) || (profile->label.flags & FLAG_IX_ON_NAME_ERROR)) { - AA_DEBUG("name lookup ix on error"); + AA_DEBUG(DEBUG_DOMAIN, "name lookup ix on error"); error = 0; new = aa_get_newest_label(&profile->label); } @@ -663,11 +691,27 @@ static struct aa_label *profile_transition(const struct cred *subj_cred, if (profile_unconfined(profile)) { new = find_attach(bprm, profile->ns, &profile->ns->base.profiles, name, &info); + /* info set -> something unusual that we should report + * Currently this is only conflicting attachments, but other + * infos added in the future should also be logged by default + * and only excluded on a case-by-case basis + */ + if (info) { + /* Because perms is never used again after this audit + * we don't need to care about clobbering it + */ + perms.audit |= MAY_EXEC; + perms.allow |= MAY_EXEC; + /* Don't cause error if auditing fails */ + (void) aa_audit_file(subj_cred, profile, &perms, + OP_EXEC, MAY_EXEC, name, target, new, cond->uid, + info, error); + } if (new) { - AA_DEBUG("unconfined attached to new label"); + AA_DEBUG(DEBUG_DOMAIN, "unconfined attached to new label"); return new; } - AA_DEBUG("unconfined exec no attachment"); + AA_DEBUG(DEBUG_DOMAIN, "unconfined exec no attachment"); return aa_get_newest_label(&profile->label); } @@ -678,9 +722,21 @@ static struct aa_label *profile_transition(const struct cred *subj_cred, new = x_to_label(profile, bprm, name, perms.xindex, &target, &info); if (new && new->proxy == profile->label.proxy && info) { + /* Force audit on conflicting attachment fallback + * Because perms is never used again after this audit + * we don't need to care about clobbering it + */ + if (info == CONFLICTING_ATTACH_STR_IX + || info == CONFLICTING_ATTACH_STR_UX) + perms.audit |= MAY_EXEC; /* hack ix fallback - improve how this is detected */ goto audit; } 
else if (!new) { + if (info) { + pr_warn_ratelimited( + "AppArmor: %s (from profile %s) audit info \"%s\" dropped on missing transition", + __func__, profile->base.hname, info); + } info = "profile transition not found"; /* remove MAY_EXEC to audit as failure or complaint */ perms.allow &= ~MAY_EXEC; @@ -739,8 +795,7 @@ static int profile_onexec(const struct cred *subj_cred, char *buffer, struct path_cond *cond, bool *secure_exec) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; aa_state_t state = rules->file->start[AA_CLASS_FILE]; struct aa_perms perms = {}; const char *xname = NULL, *info = "change_profile onexec"; @@ -755,7 +810,7 @@ static int profile_onexec(const struct cred *subj_cred, /* change_profile on exec already granted */ /* * NOTE: Domain transitions from unconfined are allowed - * even when no_new_privs is set because this aways results + * even when no_new_privs is set because this always results * in a further reduction of permissions. */ return 0; @@ -766,7 +821,7 @@ static int profile_onexec(const struct cred *subj_cred, if (error) { if (profile_unconfined(profile) || (profile->label.flags & FLAG_IX_ON_NAME_ERROR)) { - AA_DEBUG("name lookup ix on error"); + AA_DEBUG(DEBUG_DOMAIN, "name lookup ix on error"); error = 0; } xname = bprm->filename; @@ -926,7 +981,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm) * * NOTE: Domain transitions from unconfined and to stacked * subsets are allowed even when no_new_privs is set because this - * aways results in a further reduction of permissions. + * always results in a further reduction of permissions. */ if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) && !unconfined(label) && @@ -1188,10 +1243,24 @@ int aa_change_hat(const char *hats[], int count, u64 token, int flags) if (task_no_new_privs(current) && !unconfined(label) && !ctx->nnp) ctx->nnp = aa_get_label(label); + /* return -EPERM when unconfined doesn't have children to avoid + * changing the traditional error code for unconfined. 
+ */ if (unconfined(label)) { - info = "unconfined can not change_hat"; - error = -EPERM; - goto fail; + struct label_it i; + bool empty = true; + + rcu_read_lock(); + label_for_each_in_ns(i, labels_ns(label), label, profile) { + empty &= list_empty(&profile->base.profiles); + } + rcu_read_unlock(); + + if (empty) { + info = "unconfined can not change_hat"; + error = -EPERM; + goto fail; + } } if (count) { @@ -1216,7 +1285,8 @@ int aa_change_hat(const char *hats[], int count, u64 token, int flags) if (task_no_new_privs(current) && !unconfined(label) && !aa_label_is_unconfined_subset(new, ctx->nnp)) { /* not an apparmor denial per se, so don't log it */ - AA_DEBUG("no_new_privs - change_hat denied"); + AA_DEBUG(DEBUG_DOMAIN, + "no_new_privs - change_hat denied"); error = -EPERM; goto out; } @@ -1237,7 +1307,8 @@ int aa_change_hat(const char *hats[], int count, u64 token, int flags) if (task_no_new_privs(current) && !unconfined(label) && !aa_label_is_unconfined_subset(previous, ctx->nnp)) { /* not an apparmor denial per se, so don't log it */ - AA_DEBUG("no_new_privs - change_hat denied"); + AA_DEBUG(DEBUG_DOMAIN, + "no_new_privs - change_hat denied"); error = -EPERM; goto out; } @@ -1282,8 +1353,7 @@ static int change_profile_perms_wrapper(const char *op, const char *name, struct aa_label *target, bool stack, u32 request, struct aa_perms *perms) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; const char *info = NULL; int error = 0; @@ -1343,7 +1413,7 @@ int aa_change_profile(const char *fqname, int flags) if (!fqname || !*fqname) { aa_put_label(label); - AA_DEBUG("no profile name"); + AA_DEBUG(DEBUG_DOMAIN, "no profile name"); return -EINVAL; } @@ -1462,7 +1532,8 @@ check: if (task_no_new_privs(current) && !unconfined(label) && !aa_label_is_unconfined_subset(new, ctx->nnp)) { /* not an apparmor denial per se, so don't log it */ - AA_DEBUG("no_new_privs - change_hat denied"); + AA_DEBUG(DEBUG_DOMAIN, + "no_new_privs - change_hat denied"); error = -EPERM; goto out; } diff --git a/security/apparmor/file.c b/security/apparmor/file.c index f494217112c9..c75820402878 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include "include/af_unix.h" #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" @@ -168,8 +169,9 @@ static int path_name(const char *op, const struct cred *subj_cred, struct aa_perms default_perms = {}; /** - * aa_lookup_fperms - convert dfa compressed perms to internal perms - * @file_rules: the aa_policydb to lookup perms for (NOT NULL) + * aa_lookup_condperms - convert dfa compressed perms to internal perms + * @subj_uid: uid to use for subject owner test + * @rules: the aa_policydb to lookup perms for (NOT NULL) * @state: state in dfa * @cond: conditions to consider (NOT NULL) * @@ -177,18 +179,21 @@ struct aa_perms default_perms = {}; * * Returns: a pointer to a file permission set */ -struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, - aa_state_t state, struct path_cond *cond) +struct aa_perms *aa_lookup_condperms(kuid_t subj_uid, struct aa_policydb *rules, + aa_state_t state, struct path_cond *cond) { - unsigned int index = ACCEPT_TABLE(file_rules->dfa)[state]; + unsigned int index = ACCEPT_TABLE(rules->dfa)[state]; - if (!(file_rules->perms)) + if (!(rules->perms)) return &default_perms; - if (uid_eq(current_fsuid(), cond->uid)) - return 
&(file_rules->perms[index]); + if ((ACCEPT_TABLE2(rules->dfa)[state] & ACCEPT_FLAG_OWNER)) { + if (uid_eq(subj_uid, cond->uid)) + return &(rules->perms[index]); + return &(rules->perms[index + 1]); + } - return &(file_rules->perms[index + 1]); + return &(rules->perms[index]); } /** @@ -207,21 +212,22 @@ aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, { aa_state_t state; state = aa_dfa_match(file_rules->dfa, start, name); - *perms = *(aa_lookup_fperms(file_rules, state, cond)); + *perms = *(aa_lookup_condperms(current_fsuid(), file_rules, state, + cond)); return state; } -static int __aa_path_perm(const char *op, const struct cred *subj_cred, - struct aa_profile *profile, const char *name, - u32 request, struct path_cond *cond, int flags, - struct aa_perms *perms) +int __aa_path_perm(const char *op, const struct cred *subj_cred, + struct aa_profile *profile, const char *name, + u32 request, struct path_cond *cond, int flags, + struct aa_perms *perms) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; int e = 0; - if (profile_unconfined(profile)) + if (profile_unconfined(profile) || + ((flags & PATH_SOCK_COND) && !RULE_MEDIATES_v9NET(rules))) return 0; aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], name, cond, perms); @@ -316,8 +322,7 @@ static int profile_path_link(const struct cred *subj_cred, const struct path *target, char *buffer2, struct path_cond *cond) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; @@ -423,9 +428,11 @@ int aa_path_link(const struct cred *subj_cred, { struct path link = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path target = { .mnt = new_dir->mnt, .dentry = old_dentry }; + struct inode *inode = d_backing_inode(old_dentry); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(target.mnt), inode); struct path_cond cond = { - d_backing_inode(old_dentry)->i_uid, - d_backing_inode(old_dentry)->i_mode + .uid = vfsuid_into_kuid(vfsuid), + .mode = inode->i_mode, }; char *buffer = NULL, *buffer2 = NULL; struct aa_profile *profile; @@ -534,22 +541,19 @@ static int __file_sock_perm(const char *op, const struct cred *subj_cred, struct aa_label *flabel, struct file *file, u32 request, u32 denied) { - struct socket *sock = (struct socket *) file->private_data; int error; - AA_BUG(!sock); - /* revalidation due to label out of date. 
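
aa_lookup_condperms() above moves the owner test into the accept tables: when a state carries ACCEPT_FLAG_OWNER, the permission table holds two adjacent entries for it, the owner entry at index and the non-owner entry at index + 1; without the flag there is a single unconditional entry. The selection rule in isolation (illustrative helper, names hypothetical):

    /* illustrative: owner-conditional permission entry selection */
    static struct aa_perms *pick_entry(struct aa_perms *perms, unsigned int index,
                                       bool owner_cond, bool subj_is_owner)
    {
            if (!owner_cond)
                    return &perms[index];   /* unconditional entry */
            return subj_is_owner ? &perms[index] : &perms[index + 1];
    }
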
No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) return 0; /* TODO: improve to skip profiles cached in flabel */ - error = aa_sock_file_perm(subj_cred, label, op, request, sock); + error = aa_sock_file_perm(subj_cred, label, op, request, file); if (denied) { /* TODO: improve to skip profiles checked above */ /* check every profile in file label to is cached */ last_error(error, aa_sock_file_perm(subj_cred, flabel, op, - request, sock)); + request, file)); } if (!error) update_file_ctx(file_ctx(file), label, request); @@ -557,6 +561,35 @@ static int __file_sock_perm(const char *op, const struct cred *subj_cred, return error; } +/* for now separate fn to indicate semantics of the check */ +static bool __file_is_delegated(struct aa_label *obj_label) +{ + return unconfined(obj_label); +} + +static bool __unix_needs_revalidation(struct file *file, struct aa_label *label, + u32 request) +{ + struct socket *sock = (struct socket *) file->private_data; + + lockdep_assert_in_rcu_read_lock(); + + if (!S_ISSOCK(file_inode(file)->i_mode)) + return false; + if (request & NET_PEER_MASK) + return false; + if (sock->sk->sk_family == PF_UNIX) { + struct aa_sk_ctx *ctx = aa_sock(sock->sk); + + if (rcu_access_pointer(ctx->peer) != + rcu_access_pointer(ctx->peer_lastupdate)) + return true; + return !__aa_subj_label_is_cached(rcu_dereference(ctx->label), + label); + } + return false; +} + /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked @@ -594,15 +627,16 @@ int aa_file_perm(const char *op, const struct cred *subj_cred, * delegation from unconfined tasks */ denied = request & ~fctx->allow; - if (unconfined(label) || unconfined(flabel) || - (!denied && aa_label_is_subset(flabel, label))) { + if (unconfined(label) || __file_is_delegated(flabel) || + __unix_needs_revalidation(file, label, request) || + (!denied && __aa_subj_label_is_cached(label, flabel))) { rcu_read_unlock(); goto done; } + /* slow path - revalidate access */ flabel = aa_get_newest_label(flabel); rcu_read_unlock(); - /* TODO: label cross check */ if (path_mediated_fs(file->f_path.dentry)) error = __file_path_perm(op, subj_cred, label, flabel, file, diff --git a/security/apparmor/include/af_unix.h b/security/apparmor/include/af_unix.h new file mode 100644 index 000000000000..4a62e600d82b --- /dev/null +++ b/security/apparmor/include/af_unix.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AppArmor security module + * + * This file contains AppArmor af_unix fine grained mediation + * + * Copyright 2023 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ +#ifndef __AA_AF_UNIX_H +#define __AA_AF_UNIX_H + +#include <net/af_unix.h> + +#include "label.h" + +#define unix_addr(A) ((struct sockaddr_un *)(A)) +#define unix_addr_len(L) ((L) - sizeof(sa_family_t)) +#define unix_peer(sk) (unix_sk(sk)->peer) +#define is_unix_addr_abstract_name(B) ((B)[0] == 0) +#define is_unix_addr_anon(A, L) ((A) && unix_addr_len(L) <= 0) +#define is_unix_addr_fs(A, L) (!is_unix_addr_anon(A, L) && \ + !is_unix_addr_abstract_name(unix_addr(A)->sun_path)) + +#define is_unix_anonymous(U) (!unix_sk(U)->addr) +#define is_unix_fs(U) (!is_unix_anonymous(U) && \ + unix_sk(U)->addr->name->sun_path[0]) +#define is_unix_connected(S) ((S)->state == SS_CONNECTED) + + +struct sockaddr_un *aa_sunaddr(const struct unix_sock *u, int *addrlen); +int aa_unix_peer_perm(const struct cred *subj_cred, + struct aa_label *label, const char *op, u32 request, + struct sock *sk, struct sock *peer_sk, + struct aa_label *peer_label); +int aa_unix_sock_perm(const char *op, u32 request, struct socket *sock); +int aa_unix_create_perm(struct aa_label *label, int family, int type, + int protocol); +int aa_unix_bind_perm(struct socket *sock, struct sockaddr *address, + int addrlen); +int aa_unix_connect_perm(struct socket *sock, struct sockaddr *address, + int addrlen); +int aa_unix_listen_perm(struct socket *sock, int backlog); +int aa_unix_accept_perm(struct socket *sock, struct socket *newsock); +int aa_unix_msg_perm(const char *op, u32 request, struct socket *sock, + struct msghdr *msg, int size); +int aa_unix_opt_perm(const char *op, u32 request, struct socket *sock, int level, + int optname); +int aa_unix_file_perm(const struct cred *subj_cred, struct aa_label *label, + const char *op, u32 request, struct file *file); + +#endif /* __AA_AF_UNIX_H */ diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index f83934913b0f..cc6e3df1bc62 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -28,6 +28,7 @@ #define AA_CLASS_SIGNAL 10 #define AA_CLASS_XMATCH 11 #define AA_CLASS_NET 14 +#define AA_CLASS_NETV9 15 #define AA_CLASS_LABEL 16 #define AA_CLASS_POSIX_MQUEUE 17 #define AA_CLASS_MODULE 19 @@ -38,12 +39,13 @@ #define AA_CLASS_X 31 #define AA_CLASS_DBUS 32 +/* NOTE: if AA_CLASS_LAST > 63 need to update label->mediates */ #define AA_CLASS_LAST AA_CLASS_DBUS /* Control parameters settable through module/boot flags */ extern enum audit_mode aa_g_audit; extern bool aa_g_audit_header; -extern bool aa_g_debug; +extern int aa_g_debug; extern bool aa_g_hash_policy; extern bool aa_g_export_binary; extern int aa_g_rawdata_compression_level; diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index e27229349abb..1a71a94ea19c 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -138,9 +138,12 @@ struct apparmor_audit_data { }; struct { int type, protocol; - struct sock *peer_sk; void *addr; int addrlen; + struct { + void *addr; + int addrlen; + } peer; } net; }; }; diff --git a/security/apparmor/include/capability.h b/security/apparmor/include/capability.h index d6dcc604ec0c..1ddcec2d1160 100644 --- a/security/apparmor/include/capability.h +++ b/security/apparmor/include/capability.h @@ -36,6 +36,7 @@ struct aa_caps { extern struct aa_sfs_entry aa_sfs_entry_caps[]; +kernel_cap_t aa_profile_capget(struct aa_profile *profile); int aa_capable(const struct cred *subj_cred, struct aa_label *label, int cap, unsigned int opts); diff --git a/security/apparmor/include/cred.h
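
The macros in the new af_unix.h distinguish the three kinds of unix socket addresses: unnamed (anonymous/autobind), abstract (first byte of sun_path is NUL), and filesystem-backed. A worked classification using the same rules (illustrative only; assumes a well-formed sockaddr):

    #include <linux/un.h>

    /* illustrative: classify a unix address the way the macros above do */
    static const char *unix_addr_kind(struct sockaddr_un *addr, int addrlen)
    {
            int len = addrlen - (int)sizeof(sa_family_t);   /* unix_addr_len() */

            if (len <= 0)
                    return "anonymous";                     /* unnamed socket */
            return addr->sun_path[0] == '\0' ? "abstract" : "filesystem";
    }
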
b/security/apparmor/include/cred.h index 7265d2f81dd5..b028e4c13b6f 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -114,10 +114,22 @@ static inline struct aa_label *aa_get_current_label(void) return aa_get_label(l); } -#define __end_current_label_crit_section(X) end_current_label_crit_section(X) +/** + * __end_current_label_crit_section - end crit section begun with __begin_... + * @label: label obtained from __begin_current_label_crit_section + * @needput: bool set by __begin_current_label_crit_section + */ +static inline void __end_current_label_crit_section(struct aa_label *label, + bool needput) +{ + if (unlikely(needput)) + aa_put_label(label); +} /** - * end_label_crit_section - put a reference found with begin_current_label.. + * end_current_label_crit_section - put a reference found with begin_current_label.. * @label: label reference to put * * Should only be used with a reference obtained with @@ -132,6 +144,7 @@ static inline void end_current_label_crit_section(struct aa_label *label) /** * __begin_current_label_crit_section - current's confining label + * @needput: store whether the label needs to be put when ending crit section * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * @@ -142,13 +155,16 @@ static inline void end_current_label_crit_section(struct aa_label *label) * critical section between __begin_current_label_crit_section() .. * __end_current_label_crit_section() */ -static inline struct aa_label *__begin_current_label_crit_section(void) +static inline struct aa_label *__begin_current_label_crit_section(bool *needput) { struct aa_label *label = aa_current_raw_label(); - if (label_is_stale(label)) - label = aa_get_newest_label(label); + if (label_is_stale(label)) { + *needput = true; + return aa_get_newest_label(label); + } + *needput = false; return label; } @@ -184,10 +200,11 @@ static inline struct aa_ns *aa_get_current_ns(void) { struct aa_label *label; struct aa_ns *ns; + bool needput; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); ns = aa_get_ns(labels_ns(label)); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return ns; } diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 6e8f2aa66cd6..ef60f99bc5ae 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -77,12 +77,17 @@ int aa_audit_file(const struct cred *cred, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error); -struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, - aa_state_t state, struct path_cond *cond); +struct aa_perms *aa_lookup_condperms(kuid_t subj_uid, + struct aa_policydb *file_rules, + aa_state_t state, struct path_cond *cond); aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms); +int __aa_path_perm(const char *op, const struct cred *subj_cred, + struct aa_profile *profile, const char *name, + u32 request, struct path_cond *cond, int flags, + struct aa_perms *perms); int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond); @@ -99,7 +104,7 @@ void aa_inherit_files(const struct cred *cred, struct files_struct *files); /** - * aa_map_file_perms - map file flags to
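
The needput variant of the crit-section helpers above avoids a refcount get/put pair whenever the task's cached label is already current; only a stale label takes a counted reference. Callers follow this shape (sketch matching the lsm.c conversions later in this patch):

    /* illustrative caller of the needput crit-section pattern */
    static int example_perm_check(void)
    {
            struct aa_label *label;
            bool needput;
            int error = 0;

            label = __begin_current_label_crit_section(&needput);
            /* ... permission checks against label ... */
            __end_current_label_crit_section(label, needput);
            return error;
    }
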
AppArmor permissions + * aa_map_file_to_perms - map file flags to AppArmor permissions * @file: open file to map flags to AppArmor permissions * * Returns: apparmor permission set for the file diff --git a/security/apparmor/include/ipc.h b/security/apparmor/include/ipc.h index 74d17052f76b..323dd071afe9 100644 --- a/security/apparmor/include/ipc.h +++ b/security/apparmor/include/ipc.h @@ -13,6 +13,9 @@ #include <linux/sched.h> +#define SIGUNKNOWN 0 +#define MAXMAPPED_SIG 35 + int aa_may_signal(const struct cred *subj_cred, struct aa_label *sender, const struct cred *target_cred, struct aa_label *target, int sig); diff --git a/security/apparmor/include/label.h b/security/apparmor/include/label.h index 93290ae300bb..c0812dbc1b5b 100644 --- a/security/apparmor/include/label.h +++ b/security/apparmor/include/label.h @@ -19,6 +19,7 @@ #include "lib.h" struct aa_ns; +struct aa_ruleset; #define LOCAL_VEC_ENTRIES 8 #define DEFINE_VEC(T, V) \ @@ -109,7 +110,7 @@ struct label_it { int i, j; }; -/* struct aa_label - lazy labeling struct +/* struct aa_label_base - base info of label * @count: ref count of active users * @node: rbtree position * @rcu: rcu callback struct @@ -118,7 +119,10 @@ struct label_it { * @flags: stale and other flags - values may change under label set lock * @secid: secid that references this label * @size: number of entries in @ent[] - * @ent: set of profiles for label, actual size determined by @size + * @mediates: bitmask for label_mediates + * profile: label vec when embedded in a profile FLAG_PROFILE is set + * rules: variable length rules in a profile FLAG_PROFILE is set + * vec: vector of profiles comprising the compound label */ struct aa_label { struct kref count; @@ -129,7 +133,18 @@ struct aa_label { long flags; u32 secid; int size; - struct aa_profile *vec[]; + u64 mediates; + union { + struct { + /* only used is the label is a profile, size of + * rules[] is determined by the profile + * profile[1] is poison or null as guard + */ + struct aa_profile *profile[2]; + DECLARE_FLEX_ARRAY(struct aa_ruleset *, rules); + }; + DECLARE_FLEX_ARRAY(struct aa_profile *, vec); + }; }; #define last_error(E, FN) \ @@ -231,20 +246,17 @@ int aa_label_next_confined(struct aa_label *l, int i); #define fn_for_each_not_in_set(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _not_in_set) -#define LABEL_MEDIATES(L, C) \ -({ \ - struct aa_profile *profile; \ - struct label_it i; \ - int ret = 0; \ - label_for_each(i, (L), profile) { \ - if (RULE_MEDIATES(&profile->rules, (C))) { \ - ret = 1; \ - break; \ - } \ - } \ - ret; \ -}) +static inline bool label_mediates(struct aa_label *L, unsigned char C) +{ + return (L)->mediates & (((u64) 1) << (C)); +} +static inline bool label_mediates_safe(struct aa_label *L, unsigned char C) +{ + if (C > AA_CLASS_LAST) + return false; + return label_mediates(L, C); +} void aa_labelset_destroy(struct aa_labelset *ls); void aa_labelset_init(struct aa_labelset *ls); @@ -417,6 +429,13 @@ static inline void aa_put_label(struct aa_label *l) kref_put(&l->count, aa_label_kref); } +/* wrapper fn to indicate semantics of the check */ +static inline bool __aa_subj_label_is_cached(struct aa_label *subj_label, + struct aa_label *obj_label) +{ + return aa_label_is_subset(obj_label, subj_label); +} + struct aa_proxy *aa_alloc_proxy(struct aa_label *l, gfp_t gfp); void aa_proxy_kref(struct kref *kref); diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index f11a0db7f51d..444197075fd6 100644 --- a/security/apparmor/include/lib.h +++ 
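
label_mediates() above turns the old per-profile ruleset walk of LABEL_MEDIATES() into a single bit test against a u64 cached on the label, one bit per mediation class (hence the note in apparmor.h that AA_CLASS_LAST must stay below 64). For example, with AA_CLASS_SIGNAL being class 10, the test expands to roughly (illustrative):

    /* illustrative: the cached test is just a bit test on label->mediates */
    static bool mediates_signal(u64 mediates)
    {
            return mediates & (1ULL << 10); /* 10 == AA_CLASS_SIGNAL */
    }
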
b/security/apparmor/include/lib.h @@ -19,22 +19,34 @@ extern struct aa_dfa *stacksplitdfa; /* - * DEBUG remains global (no per profile flag) since it is mostly used in sysctl - * which is not related to profile accesses. - */ - -#define DEBUG_ON (aa_g_debug) -/* * split individual debug cases out in preparation for finer grained * debug controls in the future. */ -#define AA_DEBUG_LABEL DEBUG_ON #define dbg_printk(__fmt, __args...) pr_debug(__fmt, ##__args) -#define AA_DEBUG(fmt, args...) \ + +#define DEBUG_NONE 0 +#define DEBUG_LABEL_ABS_ROOT 1 +#define DEBUG_LABEL 2 +#define DEBUG_DOMAIN 4 +#define DEBUG_POLICY 8 +#define DEBUG_INTERFACE 0x10 + +#define DEBUG_ALL 0x1f /* update if new DEBUG_X added */ +#define DEBUG_PARSE_ERROR (-1) + +#define DEBUG_ON (aa_g_debug != DEBUG_NONE) +#define DEBUG_ABS_ROOT (aa_g_debug & DEBUG_LABEL_ABS_ROOT) + +#define AA_DEBUG(opt, fmt, args...) \ do { \ - if (DEBUG_ON) \ - pr_debug_ratelimited("AppArmor: " fmt, ##args); \ + if (aa_g_debug & opt) \ + pr_warn_ratelimited("%s: " fmt, __func__, ##args); \ } while (0) +#define AA_DEBUG_LABEL(LAB, X, fmt, args...) \ +do { \ + if ((LAB)->flags & FLAG_DEBUG1) \ + AA_DEBUG(X, fmt, args); \ +} while (0) #define AA_WARN(X) WARN((X), "APPARMOR WARN %s: %s\n", __func__, #X) @@ -48,9 +60,16 @@ extern struct aa_dfa *stacksplitdfa; #define AA_BUG_FMT(X, fmt, args...) \ WARN((X), "AppArmor WARN %s: (" #X "): " fmt, __func__, ##args) #else -#define AA_BUG_FMT(X, fmt, args...) no_printk(fmt, ##args) +#define AA_BUG_FMT(X, fmt, args...) \ + do { \ + BUILD_BUG_ON_INVALID(X); \ + no_printk(fmt, ##args); \ + } while (0) #endif +int aa_parse_debug_params(const char *str); +int aa_print_debug_params(char *buffer); + #define AA_ERROR(fmt, args...) \ pr_err_ratelimited("AppArmor: " fmt, ##args) @@ -106,6 +125,7 @@ struct aa_str_table { }; void aa_free_str_table(struct aa_str_table *table); +bool aa_resize_str_table(struct aa_str_table *t, int newsize, gfp_t gfp); struct counted_str { struct kref count; @@ -151,7 +171,7 @@ struct aa_policy { /** * basename - find the last component of an hname - * @name: hname to find the base profile name component of (NOT NULL) + * @hname: hname to find the base profile name component of (NOT NULL) * * Returns: the tail (base profile name) name component of an hname */ @@ -281,7 +301,7 @@ __do_cleanup: \ } \ __done: \ if (!__new_) \ - AA_DEBUG("label build failed\n"); \ + AA_DEBUG(DEBUG_LABEL, "label build failed\n"); \ (__new_); \ }) diff --git a/security/apparmor/include/match.h b/security/apparmor/include/match.h index 536ce3abd598..1fbe82f5021b 100644 --- a/security/apparmor/include/match.h +++ b/security/apparmor/include/match.h @@ -17,7 +17,7 @@ #define DFA_START 1 -/** +/* * The format used for transition tables is based on the GNU flex table * file format (--tables-file option; see Table File Format in the flex * info pages and the flex sources for documentation). 
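
aa_g_debug is now a bitmask rather than a bool, and each AA_DEBUG() site names the class of debugging it belongs to. With the value table added to lib.c later in this patch, a boot parameter such as apparmor.debug=label,domain should parse to DEBUG_LABEL | DEBUG_DOMAIN (a sketch of the expected result, using the defines above):

    /* illustrative: expected parse result for "label,domain" */
    static void debug_mask_example(void)
    {
            int mask = aa_parse_debug_params("label,domain");

            /* mask == (DEBUG_LABEL | DEBUG_DOMAIN) == 0x2 | 0x4 == 0x6 */
            (void)mask;
    }
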
The magic number @@ -137,17 +137,15 @@ aa_state_t aa_dfa_matchn_until(struct aa_dfa *dfa, aa_state_t start, void aa_dfa_free_kref(struct kref *kref); -#define WB_HISTORY_SIZE 24 +/* This needs to be a power of 2 */ +#define WB_HISTORY_SIZE 32 struct match_workbuf { - unsigned int count; unsigned int pos; unsigned int len; - unsigned int size; /* power of 2, same as history size */ - unsigned int history[WB_HISTORY_SIZE]; + aa_state_t history[WB_HISTORY_SIZE]; }; #define DEFINE_MATCH_WB(N) \ struct match_workbuf N = { \ - .count = 0, \ .pos = 0, \ .len = 0, \ } diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h index c42ed8a73f1c..0d0b0ce42723 100644 --- a/security/apparmor/include/net.h +++ b/security/apparmor/include/net.h @@ -47,8 +47,9 @@ #define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \ AA_MAY_ACCEPT) struct aa_sk_ctx { - struct aa_label *label; - struct aa_label *peer; + struct aa_label __rcu *label; + struct aa_label __rcu *peer; + struct aa_label __rcu *peer_lastupdate; /* ptr cmp only, no deref */ }; static inline struct aa_sk_ctx *aa_sock(const struct sock *sk) @@ -56,7 +57,7 @@ static inline struct aa_sk_ctx *aa_sock(const struct sock *sk) return sk->sk_security + apparmor_blob_sizes.lbs_sock; } -#define DEFINE_AUDIT_NET(NAME, OP, SK, F, T, P) \ +#define DEFINE_AUDIT_NET(NAME, OP, CRED, SK, F, T, P) \ struct lsm_network_audit NAME ## _net = { .sk = (SK), \ .family = (F)}; \ DEFINE_AUDIT_DATA(NAME, \ @@ -65,24 +66,15 @@ static inline struct aa_sk_ctx *aa_sock(const struct sock *sk) AA_CLASS_NET, \ OP); \ NAME.common.u.net = &(NAME ## _net); \ + NAME.subj_cred = (CRED); \ NAME.net.type = (T); \ NAME.net.protocol = (P) -#define DEFINE_AUDIT_SK(NAME, OP, SK) \ - DEFINE_AUDIT_NET(NAME, OP, SK, (SK)->sk_family, (SK)->sk_type, \ +#define DEFINE_AUDIT_SK(NAME, OP, CRED, SK) \ + DEFINE_AUDIT_NET(NAME, OP, CRED, SK, (SK)->sk_family, (SK)->sk_type, \ (SK)->sk_protocol) -#define af_select(FAMILY, FN, DEF_FN) \ -({ \ - int __e; \ - switch ((FAMILY)) { \ - default: \ - __e = DEF_FN; \ - } \ - __e; \ -}) - struct aa_secmark { u8 audit; u8 deny; @@ -91,11 +83,19 @@ struct aa_secmark { }; extern struct aa_sfs_entry aa_sfs_entry_network[]; - +extern struct aa_sfs_entry aa_sfs_entry_networkv9[]; + +int aa_do_perms(struct aa_profile *profile, struct aa_policydb *policy, + aa_state_t state, u32 request, struct aa_perms *p, + struct apparmor_audit_data *ad); +/* passing in state returned by XXX_mediates_AF() */ +aa_state_t aa_match_to_prot(struct aa_policydb *policy, aa_state_t state, + u32 request, u16 af, int type, int protocol, + struct aa_perms **p, const char **info); void audit_net_cb(struct audit_buffer *ab, void *va); int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, - u32 request, u16 family, int type); + u32 request, u16 family, int type, int protocol); int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol); @@ -105,13 +105,13 @@ static inline int aa_profile_af_sk_perm(struct aa_profile *profile, struct sock *sk) { return aa_profile_af_perm(profile, ad, request, sk->sk_family, - sk->sk_type); + sk->sk_type, sk->sk_protocol); } int aa_sk_perm(const char *op, u32 request, struct sock *sk); int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, - struct socket *sock); + struct file *file); int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct 
sock *sk); diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h index 343189903dba..8bb915d48dc7 100644 --- a/security/apparmor/include/path.h +++ b/security/apparmor/include/path.h @@ -13,6 +13,7 @@ enum path_flags { PATH_IS_DIR = 0x1, /* path is a directory */ + PATH_SOCK_COND = 0x2, PATH_CONNECT_PATH = 0x4, /* connect disconnected paths to / */ PATH_CHROOT_REL = 0x8, /* do path lookup relative to chroot */ PATH_CHROOT_NSCONNECT = 0x10, /* connect paths that are at ns root */ diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index bbaa7d39a39a..37a3781b99a0 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -101,8 +101,8 @@ extern struct aa_perms allperms; /** * aa_perms_accum_raw - accumulate perms with out masking off overlapping perms - * @accum - perms struct to accumulate into - * @addend - perms struct to add to @accum + * @accum: perms struct to accumulate into + * @addend: perms struct to add to @accum */ static inline void aa_perms_accum_raw(struct aa_perms *accum, struct aa_perms *addend) @@ -128,8 +128,8 @@ static inline void aa_perms_accum_raw(struct aa_perms *accum, /** * aa_perms_accum - accumulate perms, masking off overlapping perms - * @accum - perms struct to accumulate into - * @addend - perms struct to add to @accum + * @accum: perms struct to accumulate into + * @addend: perms struct to add to @accum */ static inline void aa_perms_accum(struct aa_perms *accum, struct aa_perms *addend) diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 757e3c232c57..4c50875c9d13 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -59,6 +59,11 @@ extern const char *const aa_profile_mode_names[]; #define on_list_rcu(X) (!list_empty(X) && (X)->prev != LIST_POISON2) +/* flags in the dfa accept2 table */ +enum dfa_accept_flags { + ACCEPT_FLAG_OWNER = 1, +}; + /* * FIXME: currently need a clean way to replace and remove profiles as a * set. It should be done at the namespace level. @@ -124,6 +129,7 @@ static inline void aa_put_pdb(struct aa_policydb *pdb) kref_put(&pdb->count, aa_pdb_free_kref); } +/* lookup perm that doesn't have an object conditional */ static inline struct aa_perms *aa_lookup_perms(struct aa_policydb *policy, aa_state_t state) { @@ -135,7 +141,6 @@ static inline struct aa_perms *aa_lookup_perms(struct aa_policydb *policy, return &(policy->perms[index]); } - /* struct aa_data - generic data structure key: name for retrieving this data size: size of data in bytes @@ -160,8 +165,6 @@ struct aa_data { * @secmark: secmark label match info */ struct aa_ruleset { - struct list_head list; - int size; /* TODO: merge policy and file */ @@ -175,6 +178,7 @@ struct aa_ruleset { struct aa_secmark *secmark; }; + /* struct aa_attachment - data and rules for a profiles attachment * @list: * @xmatch_str: human readable attachment string @@ -193,7 +197,6 @@ struct aa_attachment { /* struct aa_profile - basic confinement data * @base - base components of the profile (name, refcount, lists, lock ...)
- * @label - label this profile is an extension of * @parent: parent of profile * @ns: namespace the profile is in * @rename: optional profile name that this profile renamed @@ -201,13 +204,20 @@ struct aa_attachment { * @audit: the auditing mode of the profile * @mode: the enforcement mode of the profile * @path_flags: flags controlling path generation behavior + * @signal: the signal that should be used when kill is used * @disconnected: what to prepend if attach_disconnected is specified * @attach: attachment rules for the profile * @rules: rules to be enforced * + * learning_cache: the accesses learned in complain mode + * raw_data: rawdata of the loaded profile policy + * hash: cryptographic hash of the profile * @dents: dentries for the profiles file entries in apparmorfs * @dirname: name of the profile dir in apparmorfs + * @dents: set of dentries associated with the profile * @data: hashtable for free-form policy aa_data + * @label - label this profile is an extension of + * @rules - label with the rule vec on its end * * The AppArmor profile contains the basic confinement data. Each profile * has a name, and exists in a namespace. The @name and @exec_match are @@ -231,16 +241,19 @@ struct aa_profile { enum audit_mode audit; long mode; u32 path_flags; + int signal; const char *disconnected; struct aa_attachment attach; - struct list_head rules; struct aa_loaddata *rawdata; unsigned char *hash; char *dirname; struct dentry *dents[AAFS_PROF_SIZEOF]; struct rhashtable *data; + + int n_rules; + /* special - variable length must be last entry in profile */ struct aa_label label; }; @@ -298,24 +311,38 @@ static inline aa_state_t RULE_MEDIATES(struct aa_ruleset *rules, rules->policy->start[0], &class, 1); } -static inline aa_state_t RULE_MEDIATES_AF(struct aa_ruleset *rules, u16 AF) +static inline aa_state_t RULE_MEDIATES_v9NET(struct aa_ruleset *rules) { - aa_state_t state = RULE_MEDIATES(rules, AA_CLASS_NET); - __be16 be_af = cpu_to_be16(AF); + return RULE_MEDIATES(rules, AA_CLASS_NETV9); +} + +static inline aa_state_t RULE_MEDIATES_NET(struct aa_ruleset *rules) +{ + /* can not use RULE_MEDIATE_v9AF here, because AF match fail + * can not be distinguished from class match fail, and we only + * fall back to checking older class on class match failure + */ + aa_state_t state = RULE_MEDIATES(rules, AA_CLASS_NETV9); + /* fallback and check v7/8 if v9 is NOT mediated */ if (!state) - return DFA_NOMATCH; - return aa_dfa_match_len(rules->policy->dfa, state, (char *) &be_af, 2); + state = RULE_MEDIATES(rules, AA_CLASS_NET); + + return state; } -static inline aa_state_t ANY_RULE_MEDIATES(struct list_head *head, - unsigned char class) + +void aa_compute_profile_mediates(struct aa_profile *profile); +static inline bool profile_mediates(struct aa_profile *profile, + unsigned char class) { - struct aa_ruleset *rule; + return label_mediates(&profile->label, class); +} - /* TODO: change to list walk */ - rule = list_first_entry(head, typeof(*rule), list); - return RULE_MEDIATES(rule, class); +static inline bool profile_mediates_safe(struct aa_profile *profile, + unsigned char class) +{ + return label_mediates_safe(&profile->label, class); } /** diff --git a/security/apparmor/include/sig_names.h b/security/apparmor/include/sig_names.h index cbf7a997ed84..c772668cdc62 100644 --- a/security/apparmor/include/sig_names.h +++ b/security/apparmor/include/sig_names.h @@ -1,9 +1,5 @@ #include <linux/signal.h> - -#define SIGUNKNOWN 0 -#define MAXMAPPED_SIG 35 -#define MAXMAPPED_SIGNAME (MAXMAPPED_SIG + 1)
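
With rulesets now hanging off the embedded label, the repeated list_first_entry() walks replaced earlier in this patch collapse to an array access: struct aa_profile ends in a variable-length struct aa_label whose rules[] vector holds profile->n_rules entries. The access pattern, before and after (illustrative comparison):

    /* illustrative: first-ruleset access before and after this patch */
    static struct aa_ruleset *first_ruleset(struct aa_profile *profile)
    {
            /* old: list_first_entry(&profile->rules, struct aa_ruleset, list) */
            return profile->label.rules[0];
    }
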
-#define SIGRT_BASE 128 +#include "signal.h" /* provide a mapping of arch signal to internal signal # for mediation * those that are always an alias SIGCLD for SIGCLHD and SIGPOLL for SIGIO diff --git a/security/apparmor/include/signal.h b/security/apparmor/include/signal.h new file mode 100644 index 000000000000..729763fa7ce6 --- /dev/null +++ b/security/apparmor/include/signal.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AppArmor security module + * + * This file contains AppArmor ipc mediation function definitions. + * + * Copyright 2023 Canonical Ltd. + */ + +#ifndef __AA_SIGNAL_H +#define __AA_SIGNAL_H + +#define SIGUNKNOWN 0 +#define MAXMAPPED_SIG 35 + +#define MAXMAPPED_SIGNAME (MAXMAPPED_SIG + 1) +#define SIGRT_BASE 128 + +#endif /* __AA_SIGNAL_H */ diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 0cdf4340b02d..df5712cea685 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -80,21 +80,20 @@ static int profile_signal_perm(const struct cred *cred, struct aa_label *peer, u32 request, struct apparmor_audit_data *ad) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms perms; aa_state_t state; - if (profile_unconfined(profile) || - !ANY_RULE_MEDIATES(&profile->rules, AA_CLASS_SIGNAL)) + if (profile_unconfined(profile)) return 0; ad->subj_cred = cred; ad->peer = peer; /* TODO: secondary cache check <profile, profile, perm> */ - state = aa_dfa_next(rules->policy->dfa, - rules->policy->start[AA_CLASS_SIGNAL], - ad->signal); + state = RULE_MEDIATES(rules, AA_CLASS_SIGNAL); + if (!state) + return 0; + state = aa_dfa_next(rules->policy->dfa, state, ad->signal); aa_label_match(profile, rules, peer, state, false, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_signal_cb); diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 91483ecacc16..913678f199c3 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -198,21 +198,25 @@ static bool vec_is_stale(struct aa_profile **vec, int n) return false; } -static long accum_vec_flags(struct aa_profile **vec, int n) +static void accum_label_info(struct aa_label *new) { long u = FLAG_UNCONFINED; int i; - AA_BUG(!vec); + AA_BUG(!new); - for (i = 0; i < n; i++) { - u |= vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | - FLAG_STALE); - if (!(u & vec[i]->label.flags & FLAG_UNCONFINED)) + /* size == 1 is a profile and flags must be set as part of creation */ + if (new->size == 1) + return; + + for (i = 0; i < new->size; i++) { + u |= new->vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | + FLAG_STALE); + if (!(u & new->vec[i]->label.flags & FLAG_UNCONFINED)) u &= ~FLAG_UNCONFINED; + new->mediates |= new->vec[i]->label.mediates; } - - return u; + new->flags |= u; } static int sort_cmp(const void *a, const void *b) @@ -431,7 +435,7 @@ struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp) /* + 1 for null terminator entry on vec */ new = kzalloc(struct_size(new, vec, size + 1), gfp); - AA_DEBUG("%s (%p)\n", __func__, new); + AA_DEBUG(DEBUG_LABEL, "%s (%p)\n", __func__, new); if (!new) goto fail; @@ -645,6 +649,7 @@ static bool __label_replace(struct aa_label *old, struct aa_label *new) rb_replace_node(&old->node, &new->node, &ls->root); old->flags &= ~FLAG_IN_TREE; new->flags |= FLAG_IN_TREE; + accum_label_info(new); return true; } @@ -705,6 +710,7 @@ static 
struct aa_label *__label_insert(struct aa_labelset *ls, rb_link_node(&label->node, parent, new); rb_insert_color(&label->node, &ls->root); label->flags |= FLAG_IN_TREE; + accum_label_info(label); return aa_get_label(label); } @@ -1085,7 +1091,6 @@ static struct aa_label *label_merge_insert(struct aa_label *new, else if (k == b->size) return aa_get_label(b); } - new->flags |= accum_vec_flags(new->vec, new->size); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); label = __label_insert(labels_set(new), new, false); @@ -1456,7 +1461,7 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) /* * cached label name is present and visible - * @label->hname only exists if label is namespace hierachical + * @label->hname only exists if label is namespace hierarchical */ static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, int flags) @@ -1617,7 +1622,7 @@ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, AA_BUG(!str && size != 0); AA_BUG(!label); - if (AA_DEBUG_LABEL && (flags & FLAG_ABS_ROOT)) { + if (DEBUG_ABS_ROOT && (flags & FLAG_ABS_ROOT)) { ns = root_ns; len = snprintf(str, size, "_"); update_for_len(total, len, size, str); @@ -1731,7 +1736,7 @@ void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len < 0) { - AA_DEBUG("label print error"); + AA_DEBUG(DEBUG_LABEL, "label print error"); return; } str = name; @@ -1759,7 +1764,7 @@ void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { - AA_DEBUG("label print error"); + AA_DEBUG(DEBUG_LABEL, "label print error"); return; } seq_puts(f, str); @@ -1782,7 +1787,7 @@ void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { - AA_DEBUG("label print error"); + AA_DEBUG(DEBUG_LABEL, "label print error"); return; } pr_info("%s", str); @@ -1865,7 +1870,7 @@ struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, AA_BUG(!str); str = skipn_spaces(str, n); - if (str == NULL || (AA_DEBUG_LABEL && *str == '_' && + if (str == NULL || (DEBUG_ABS_ROOT && *str == '_' && base != &root_ns->unconfined->label)) return ERR_PTR(-EINVAL); diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 7db62213e352..82dbb97ad406 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -25,6 +25,120 @@ struct aa_perms allperms = { .allow = ALL_PERMS_MASK, .quiet = ALL_PERMS_MASK, .hide = ALL_PERMS_MASK }; +struct val_table_ent { + const char *str; + int value; +}; + +static struct val_table_ent debug_values_table[] = { + { "N", DEBUG_NONE }, + { "none", DEBUG_NONE }, + { "n", DEBUG_NONE }, + { "0", DEBUG_NONE }, + { "all", DEBUG_ALL }, + { "Y", DEBUG_ALL }, + { "y", DEBUG_ALL }, + { "1", DEBUG_ALL }, + { "abs_root", DEBUG_LABEL_ABS_ROOT }, + { "label", DEBUG_LABEL }, + { "domain", DEBUG_DOMAIN }, + { "policy", DEBUG_POLICY }, + { "interface", DEBUG_INTERFACE }, + { NULL, 0 } +}; + +static struct val_table_ent *val_table_find_ent(struct val_table_ent *table, + const char *name, size_t len) +{ + struct val_table_ent *entry; + + for (entry = table; entry->str != NULL; entry++) { + if (strncmp(entry->str, name, len) == 0 && + strlen(entry->str) == len) + return entry; + } + return NULL; +} + +int aa_parse_debug_params(const char *str) +{ + struct val_table_ent *ent; + const char *next; + int val = 0; + + 
do { + size_t n = strcspn(str, "\r\n,"); + + next = str + n; + ent = val_table_find_ent(debug_values_table, str, next - str); + if (ent) + val |= ent->value; + else + AA_DEBUG(DEBUG_INTERFACE, "unknown debug type '%.*s'", + (int)(next - str), str); + str = next + 1; + } while (*next != 0); + return val; +} + +/** + * val_mask_to_str - convert a value mask to its short string + * @str: character buffer to store string in (at least 10 characters) + * @size: size of the @str buffer + * @table: NULL-terminated table of value names to match against (NOT NULL) + * @mask: mask of values to convert + */ +static int val_mask_to_str(char *str, size_t size, + const struct val_table_ent *table, u32 mask) +{ + const struct val_table_ent *ent; + int total = 0; + + for (ent = table; ent->str; ent++) { + if (ent->value && (ent->value & mask) == ent->value) { + int len = scnprintf(str, size, "%s%s", total ? "," : "", + ent->str); + size -= len; + str += len; + total += len; + mask &= ~ent->value; + } + } + + return total; +} + +int aa_print_debug_params(char *buffer) +{ + if (!aa_g_debug) + return sprintf(buffer, "N"); + return val_mask_to_str(buffer, PAGE_SIZE, debug_values_table, + aa_g_debug); +} + +bool aa_resize_str_table(struct aa_str_table *t, int newsize, gfp_t gfp) +{ + char **n; + int i; + + if (t->size == newsize) + return true; + n = kcalloc(newsize, sizeof(*n), gfp); + if (!n) + return false; + for (i = 0; i < min(t->size, newsize); i++) + n[i] = t->table[i]; + for (; i < t->size; i++) + kfree_sensitive(t->table[i]); + if (newsize > t->size) + memset(&n[t->size], 0, (newsize-t->size)*sizeof(*n)); + kfree_sensitive(t->table); + t->table = n; + t->size = newsize; + + return true; +} + /** * aa_free_str_table - free entries str table * @t: the string table to free (MAYBE NULL) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 9b6c2f157f83..8e1cc229b41b 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -26,6 +26,7 @@ #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> +#include "include/af_unix.h" #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" @@ -126,14 +127,15 @@ static int apparmor_ptrace_access_check(struct task_struct *child, struct aa_label *tracer, *tracee; const struct cred *cred; int error; + bool needput; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ - tracer = __begin_current_label_crit_section(); + tracer = __begin_current_label_crit_section(&needput); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ?
AA_PTRACE_READ : AA_PTRACE_TRACE); - __end_current_label_crit_section(tracer); + __end_current_label_crit_section(tracer, needput); put_cred(cred); return error; @@ -144,14 +146,15 @@ static int apparmor_ptrace_traceme(struct task_struct *parent) struct aa_label *tracer, *tracee; const struct cred *cred; int error; + bool needput; - tracee = __begin_current_label_crit_section(); + tracee = __begin_current_label_crit_section(&needput); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); - __end_current_label_crit_section(tracee); + __end_current_label_crit_section(tracee, needput); return error; } @@ -176,15 +179,11 @@ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effec struct label_it i; label_for_each_confined(i, label, profile) { - struct aa_ruleset *rules; - if (COMPLAIN_MODE(profile)) - continue; - rules = list_first_entry(&profile->rules, - typeof(*rules), list); - *effective = cap_intersect(*effective, - rules->caps.allow); - *permitted = cap_intersect(*permitted, - rules->caps.allow); + kernel_cap_t allowed; + + allowed = aa_profile_capget(profile); + *effective = cap_intersect(*effective, allowed); + *permitted = cap_intersect(*permitted, allowed); } } rcu_read_unlock(); @@ -221,12 +220,13 @@ static int common_perm(const char *op, const struct path *path, u32 mask, { struct aa_label *label; int error = 0; + bool needput; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -524,14 +524,15 @@ static int common_file_perm(const char *op, struct file *file, u32 mask, { struct aa_label *label; int error = 0; + bool needput; /* don't reaudit files closed during inheritance */ - if (file->f_path.dentry == aa_null.dentry) + if (unlikely(file->f_path.dentry == aa_null.dentry)) return -EACCES; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -633,7 +634,7 @@ static int profile_uring(struct aa_profile *profile, u32 request, AA_BUG(!profile); - rules = list_first_entry(&profile->rules, typeof(*rules), list); + rules = profile->label.rules[0]; state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; @@ -664,15 +665,16 @@ static int apparmor_uring_override_creds(const struct cred *new) struct aa_profile *profile; struct aa_label *label; int error; + bool needput; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -688,14 +690,15 @@ static int apparmor_uring_sqpoll(void) struct aa_profile *profile; struct aa_label *label; int error; + bool needput; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); 
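/*
 * Calling-convention sketch for the conversions below (the same pattern
 * repeats in every hook this patch touches): the begin helper now reports
 * through @needput whether the matching end call still has a label
 * reference to drop. A minimal consumer, with mediate() standing in for
 * any hypothetical permission check:
 *
 *	struct aa_label *label;
 *	bool needput;
 *	int error;
 *
 *	label = __begin_current_label_crit_section(&needput);
 *	error = mediate(label);
 *	__end_current_label_crit_section(label, needput);
 */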
- label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -706,6 +709,7 @@ static int apparmor_sb_mount(const char *dev_name, const struct path *path, { struct aa_label *label; int error = 0; + bool needput; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) @@ -713,7 +717,7 @@ static int apparmor_sb_mount(const char *dev_name, const struct path *path, flags &= ~AA_MS_IGNORE_MASK; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, @@ -732,7 +736,7 @@ static int apparmor_sb_mount(const char *dev_name, const struct path *path, error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -742,12 +746,13 @@ static int apparmor_move_mount(const struct path *from_path, { struct aa_label *label; int error = 0; + bool needput; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -756,11 +761,12 @@ static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; + bool needput; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -984,10 +990,12 @@ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop) { - struct aa_label *label = __begin_current_label_crit_section(); + struct aa_label *label; + bool needput; + label = __begin_current_label_crit_section(&needput); prop->apparmor.label = label; - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); } static void apparmor_task_getlsmprop_obj(struct task_struct *p, @@ -1002,13 +1010,16 @@ static void apparmor_task_getlsmprop_obj(struct task_struct *p, static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { - struct aa_label *label = __begin_current_label_crit_section(); + struct aa_label *label; int error = 0; + bool needput; + + label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return error; } @@ -1019,6 +1030,7 @@ static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo const struct cred *tc; struct aa_label *cl, *tl; int error; + bool needput; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); @@ -1030,9 +1042,9 @@ static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo error = aa_may_signal(cred, cl, tc, tl, sig); 
aa_put_label(cl); } else { - cl = __begin_current_label_crit_section(); + cl = __begin_current_label_crit_section(&needput); error = aa_may_signal(current_cred(), cl, tc, tl, sig); - __end_current_label_crit_section(cl); + __end_current_label_crit_section(cl, needput); } aa_put_label(tl); put_cred(tc); @@ -1061,12 +1073,29 @@ static int apparmor_userns_create(const struct cred *cred) return error; } +static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t gfp) +{ + struct aa_sk_ctx *ctx = aa_sock(sk); + struct aa_label *label; + bool needput; + + label = __begin_current_label_crit_section(&needput); + //spin_lock_init(&ctx->lock); + rcu_assign_pointer(ctx->label, aa_get_label(label)); + rcu_assign_pointer(ctx->peer, NULL); + rcu_assign_pointer(ctx->peer_lastupdate, NULL); + __end_current_label_crit_section(label, needput); + return 0; +} + static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); - aa_put_label(ctx->label); - aa_put_label(ctx->peer); + /* dead these won't be updated any more */ + aa_put_label(rcu_dereference_protected(ctx->label, true)); + aa_put_label(rcu_dereference_protected(ctx->peer, true)); + aa_put_label(rcu_dereference_protected(ctx->peer_lastupdate, true)); } /** @@ -1080,13 +1109,153 @@ static void apparmor_sk_clone_security(const struct sock *sk, struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); - if (new->label) - aa_put_label(new->label); - new->label = aa_get_label(ctx->label); + /* not actually in use yet */ + if (rcu_access_pointer(ctx->label) != rcu_access_pointer(new->label)) { + aa_put_label(rcu_dereference_protected(new->label, true)); + rcu_assign_pointer(new->label, aa_get_label_rcu(&ctx->label)); + } + + if (rcu_access_pointer(ctx->peer) != rcu_access_pointer(new->peer)) { + aa_put_label(rcu_dereference_protected(new->peer, true)); + rcu_assign_pointer(new->peer, aa_get_label_rcu(&ctx->peer)); + } + + if (rcu_access_pointer(ctx->peer_lastupdate) != rcu_access_pointer(new->peer_lastupdate)) { + aa_put_label(rcu_dereference_protected(new->peer_lastupdate, true)); + rcu_assign_pointer(new->peer_lastupdate, + aa_get_label_rcu(&ctx->peer_lastupdate)); + } +} + +static int unix_connect_perm(const struct cred *cred, struct aa_label *label, + struct sock *sk, struct sock *peer_sk) +{ + struct aa_sk_ctx *peer_ctx = aa_sock(peer_sk); + int error; + + error = aa_unix_peer_perm(cred, label, OP_CONNECT, + (AA_MAY_CONNECT | AA_MAY_SEND | AA_MAY_RECEIVE), + sk, peer_sk, + rcu_dereference_protected(peer_ctx->label, + lockdep_is_held(&unix_sk(peer_sk)->lock))); + if (!is_unix_fs(peer_sk)) { + last_error(error, + aa_unix_peer_perm(cred, + rcu_dereference_protected(peer_ctx->label, + lockdep_is_held(&unix_sk(peer_sk)->lock)), + OP_CONNECT, + (AA_MAY_ACCEPT | AA_MAY_SEND | AA_MAY_RECEIVE), + peer_sk, sk, label)); + } + + return error; +} + +/* lockdep check in unix_connect_perm - push sks here to check */ +static void unix_connect_peers(struct aa_sk_ctx *sk_ctx, + struct aa_sk_ctx *peer_ctx) +{ + /* Cross reference the peer labels for SO_PEERSEC */ + struct aa_label *label = rcu_dereference_protected(sk_ctx->label, true); + + aa_get_label(label); + aa_put_label(rcu_dereference_protected(peer_ctx->peer, + true)); + rcu_assign_pointer(peer_ctx->peer, label); /* transfer cnt */ + + label = aa_get_label(rcu_dereference_protected(peer_ctx->label, + true)); + //spin_unlock(&peer_ctx->lock); + + //spin_lock(&sk_ctx->lock); + aa_put_label(rcu_dereference_protected(sk_ctx->peer, + true)); + 
aa_put_label(rcu_dereference_protected(sk_ctx->peer_lastupdate, + true)); + + rcu_assign_pointer(sk_ctx->peer, aa_get_label(label)); + rcu_assign_pointer(sk_ctx->peer_lastupdate, label); /* transfer cnt */ + //spin_unlock(&sk_ctx->lock); +} + +/** + * apparmor_unix_stream_connect - check perms before making unix domain conn + * @sk: sk attempting to connect + * @peer_sk: sk that is accepting the connection + * @newsk: new sk created for this connection + * peer is locked when this hook is called + * + * Return: + * 0 if connection is permitted + * error code on denial or failure + */ +static int apparmor_unix_stream_connect(struct sock *sk, struct sock *peer_sk, + struct sock *newsk) +{ + struct aa_sk_ctx *sk_ctx = aa_sock(sk); + struct aa_sk_ctx *peer_ctx = aa_sock(peer_sk); + struct aa_sk_ctx *new_ctx = aa_sock(newsk); + struct aa_label *label; + int error; + bool needput; + + label = __begin_current_label_crit_section(&needput); + error = unix_connect_perm(current_cred(), label, sk, peer_sk); + __end_current_label_crit_section(label, needput); + + if (error) + return error; + + /* newsk doesn't go through post_create, but does go through + * security_sk_alloc() + */ + rcu_assign_pointer(new_ctx->label, + aa_get_label(rcu_dereference_protected(peer_ctx->label, + true))); + + /* Cross reference the peer labels for SO_PEERSEC */ + unix_connect_peers(sk_ctx, new_ctx); + + return 0; +} + +/** + * apparmor_unix_may_send - check perms before conn or sending unix dgrams + * @sock: socket sending the message + * @peer: socket the message is being sent to + * + * Performs bidirectional permission checks for Unix domain socket communication: + * 1. Verifies sender has AA_MAY_SEND to target socket + * 2. Verifies receiver has AA_MAY_RECEIVE from source socket + * + * sock and peer are locked when this hook is called + * called by: dgram_connect peer setup but path not copied to newsk + * + * Return: + * 0 if transmission is permitted + * error code on denial or failure + */ +static int apparmor_unix_may_send(struct socket *sock, struct socket *peer) +{ + struct aa_sk_ctx *peer_ctx = aa_sock(peer->sk); + struct aa_label *label; + int error; + bool needput; + + label = __begin_current_label_crit_section(&needput); + error = xcheck(aa_unix_peer_perm(current_cred(), + label, OP_SENDMSG, AA_MAY_SEND, + sock->sk, peer->sk, + rcu_dereference_protected(peer_ctx->label, + true)), + aa_unix_peer_perm(peer->file ?
peer->file->f_cred : NULL, + rcu_dereference_protected(peer_ctx->label, + true), + OP_SENDMSG, AA_MAY_RECEIVE, peer->sk, + sock->sk, label)); + __end_current_label_crit_section(label, needput); - if (new->peer) - aa_put_label(new->peer); - new->peer = aa_get_label(ctx->peer); + return error; } static int apparmor_socket_create(int family, int type, int protocol, int kern) @@ -1096,13 +1265,19 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern) AA_BUG(in_interrupt()); + if (kern) + return 0; + label = begin_current_label_crit_section(); - if (!(kern || unconfined(label))) - error = af_select(family, - create_perm(label, family, type, protocol), - aa_af_perm(current_cred(), label, - OP_CREATE, AA_MAY_CREATE, - family, type, protocol)); + if (!unconfined(label)) { + if (family == PF_UNIX) + error = aa_unix_create_perm(label, family, type, + protocol); + else + error = aa_af_perm(current_cred(), label, OP_CREATE, + AA_MAY_CREATE, family, type, + protocol); + } end_current_label_crit_section(label); return error; @@ -1135,14 +1310,58 @@ static int apparmor_socket_post_create(struct socket *sock, int family, if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); - aa_put_label(ctx->label); - ctx->label = aa_get_label(label); + /* still not live */ + aa_put_label(rcu_dereference_protected(ctx->label, true)); + rcu_assign_pointer(ctx->label, aa_get_label(label)); } aa_put_label(label); return 0; } +static int apparmor_socket_socketpair(struct socket *socka, + struct socket *sockb) +{ + struct aa_sk_ctx *a_ctx = aa_sock(socka->sk); + struct aa_sk_ctx *b_ctx = aa_sock(sockb->sk); + struct aa_label *label; + + /* socks not live yet - initial values set in sk_alloc */ + label = begin_current_label_crit_section(); + if (rcu_access_pointer(a_ctx->label) != label) { + AA_BUG("a_ctx != label"); + aa_put_label(rcu_dereference_protected(a_ctx->label, true)); + rcu_assign_pointer(a_ctx->label, aa_get_label(label)); + } + if (rcu_access_pointer(b_ctx->label) != label) { + AA_BUG("b_ctx != label"); + aa_put_label(rcu_dereference_protected(b_ctx->label, true)); + rcu_assign_pointer(b_ctx->label, aa_get_label(label)); + } + + if (socka->sk->sk_family == PF_UNIX) { + /* unix socket pairs by-pass unix_stream_connect */ + unix_connect_peers(a_ctx, b_ctx); + } + end_current_label_crit_section(label); + + return 0; +} + +/** + * apparmor_socket_bind - check perms before bind addr to socket + * @sock: socket to bind the address to (must be non-NULL) + * @address: address that is being bound (must be non-NULL) + * @addrlen: length of @address + * + * Performs security checks before allowing a socket to bind to an address. + * Handles Unix domain sockets specially through aa_unix_bind_perm(). + * For other socket families, uses generic permission check via aa_sk_perm(). 
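*
* A call-path sketch (illustrative, not part of this kernel-doc): a
* bind() on an AF_UNIX socket is mediated by aa_unix_bind_perm(), while
* a bind() on, say, an AF_INET socket falls through to
* aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk), as the body below shows.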
+ * + * Return: + * 0 if binding is permitted + * error code on denial or invalid parameters + */ static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { @@ -1151,9 +1370,9 @@ static int apparmor_socket_bind(struct socket *sock, AA_BUG(!address); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - bind_perm(sock, address, addrlen), - aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk)); + if (sock->sk->sk_family == PF_UNIX) + return aa_unix_bind_perm(sock, address, addrlen); + return aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk); } static int apparmor_socket_connect(struct socket *sock, @@ -1164,9 +1383,10 @@ static int apparmor_socket_connect(struct socket *sock, AA_BUG(!address); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - connect_perm(sock, address, addrlen), - aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk)); + /* PF_UNIX goes through unix_stream_connect && unix_may_send */ + if (sock->sk->sk_family == PF_UNIX) + return 0; + return aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk); } static int apparmor_socket_listen(struct socket *sock, int backlog) @@ -1175,9 +1395,9 @@ static int apparmor_socket_listen(struct socket *sock, int backlog) AA_BUG(!sock->sk); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - listen_perm(sock, backlog), - aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk)); + if (sock->sk->sk_family == PF_UNIX) + return aa_unix_listen_perm(sock, backlog); + return aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk); } /* @@ -1191,9 +1411,9 @@ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) AA_BUG(!newsock); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - accept_perm(sock, newsock), - aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk)); + if (sock->sk->sk_family == PF_UNIX) + return aa_unix_accept_perm(sock, newsock); + return aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, @@ -1204,9 +1424,10 @@ static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, AA_BUG(!msg); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - msg_perm(op, request, sock, msg, size), - aa_sk_perm(op, request, sock->sk)); + /* PF_UNIX goes through unix_may_send */ + if (sock->sk->sk_family == PF_UNIX) + return 0; + return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_sendmsg(struct socket *sock, @@ -1228,9 +1449,9 @@ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) AA_BUG(!sock->sk); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - sock_perm(op, request, sock), - aa_sk_perm(op, request, sock->sk)); + if (sock->sk->sk_family == PF_UNIX) + return aa_unix_sock_perm(op, request, sock); + return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_getsockname(struct socket *sock) @@ -1251,9 +1472,9 @@ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, AA_BUG(!sock->sk); AA_BUG(in_interrupt()); - return af_select(sock->sk->sk_family, - opt_perm(op, request, sock, level, optname), - aa_sk_perm(op, request, sock->sk)); + if (sock->sk->sk_family == PF_UNIX) + return aa_unix_opt_perm(op, request, sock, level, optname); + return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_getsockopt(struct socket *sock, int level, @@ -1289,6 +1510,7 @@ static int apparmor_socket_shutdown(struct socket *sock, int how) static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff 
*skb) { struct aa_sk_ctx *ctx = aa_sock(sk); + int error; if (!skb->secmark) return 0; @@ -1297,23 +1519,31 @@ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) * If reach here before socket_post_create hook is called, in which * case label is null, drop the packet. */ - if (!ctx->label) + if (!rcu_access_pointer(ctx->label)) return -EACCES; - return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, - skb->secmark, sk); + rcu_read_lock(); + error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_RECVMSG, + AA_MAY_RECEIVE, skb->secmark, sk); + rcu_read_unlock(); + + return error; } #endif -static struct aa_label *sk_peer_label(struct sock *sk) +static struct aa_label *sk_peer_get_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); + struct aa_label *label = ERR_PTR(-ENOPROTOOPT); - if (ctx->peer) - return ctx->peer; + if (rcu_access_pointer(ctx->peer)) + return aa_get_label_rcu(&ctx->peer); - return ERR_PTR(-ENOPROTOOPT); + if (sk->sk_family != PF_UNIX) + return ERR_PTR(-ENOPROTOOPT); + + return label; } /** @@ -1335,19 +1565,19 @@ static int apparmor_socket_getpeersec_stream(struct socket *sock, struct aa_label *label; struct aa_label *peer; - label = begin_current_label_crit_section(); - peer = sk_peer_label(sock->sk); + peer = sk_peer_get_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } + label = begin_current_label_crit_section(); slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; - goto done; + goto done_put; } if (slen > len) { error = -ERANGE; @@ -1359,8 +1589,11 @@ static int apparmor_socket_getpeersec_stream(struct socket *sock, done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; -done: + +done_put: end_current_label_crit_section(label); + aa_put_label(peer); +done: kfree(name); return error; } @@ -1396,8 +1629,9 @@ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); - if (!ctx->label) - ctx->label = aa_get_current_label(); + /* setup - not live */ + if (!rcu_access_pointer(ctx->label)) + rcu_assign_pointer(ctx->label, aa_get_current_label()); } #ifdef CONFIG_NETWORK_SECMARK @@ -1405,12 +1639,17 @@ static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); + int error; if (!skb->secmark) return 0; - return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, - skb->secmark, sk); + rcu_read_lock(); + error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_CONNECT, + AA_MAY_CONNECT, skb->secmark, sk); + rcu_read_unlock(); + + return error; } #endif @@ -1467,11 +1706,16 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), + LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), + LSM_HOOK_INIT(unix_stream_connect, apparmor_unix_stream_connect), + LSM_HOOK_INIT(unix_may_send, apparmor_unix_may_send), + LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), + LSM_HOOK_INIT(socket_socketpair, apparmor_socket_socketpair), 
LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), @@ -1571,6 +1815,9 @@ static const struct kernel_param_ops param_ops_aalockpolicy = { .get = param_get_aalockpolicy }; +static int param_set_debug(const char *val, const struct kernel_param *kp); +static int param_get_debug(char *buffer, const struct kernel_param *kp); + static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); @@ -1604,8 +1851,9 @@ module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ -bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES); -module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR); +int aa_g_debug; +module_param_call(debug, param_set_debug, param_get_debug, + &aa_g_debug, 0600); /* Audit mode */ enum audit_mode aa_g_audit; @@ -1798,6 +2046,34 @@ static int param_get_aacompressionlevel(char *buffer, return param_get_int(buffer, kp); } +static int param_get_debug(char *buffer, const struct kernel_param *kp) +{ + if (!apparmor_enabled) + return -EINVAL; + if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) + return -EPERM; + return aa_print_debug_params(buffer); +} + +static int param_set_debug(const char *val, const struct kernel_param *kp) +{ + int i; + + if (!apparmor_enabled) + return -EINVAL; + if (!val) + return -EINVAL; + if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) + return -EPERM; + + i = aa_parse_debug_params(val); + if (i == DEBUG_PARSE_ERROR) + return -EINVAL; + + aa_g_debug = i; + return 0; +} + static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) @@ -2006,7 +2282,7 @@ static int __init alloc_buffers(void) * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be - * disabled early at boot if aa_g_path_max is extremly high. + * disabled early at boot if aa_g_path_max is extremely high. 
*/ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; @@ -2082,6 +2358,7 @@ static unsigned int apparmor_ip_postroute(void *priv, { struct aa_sk_ctx *ctx; struct sock *sk; + int error; if (!skb->secmark) return NF_ACCEPT; @@ -2091,8 +2368,11 @@ static unsigned int apparmor_ip_postroute(void *priv, return NF_ACCEPT; ctx = aa_sock(sk); - if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, - skb->secmark, sk)) + rcu_read_lock(); + error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_SENDMSG, + AA_MAY_SEND, skb->secmark, sk); + rcu_read_unlock(); + if (!error) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); @@ -2149,12 +2429,12 @@ static int __init apparmor_nf_ip_init(void) __initcall(apparmor_nf_ip_init); #endif -static char nulldfa_src[] = { +static char nulldfa_src[] __aligned(8) = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; -static char stacksplitdfa_src[] = { +static char stacksplitdfa_src[] __aligned(8) = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; diff --git a/security/apparmor/match.c b/security/apparmor/match.c index f2d9c57f8794..c5a91600842a 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -679,34 +679,35 @@ aa_state_t aa_dfa_matchn_until(struct aa_dfa *dfa, aa_state_t start, return state; } -#define inc_wb_pos(wb) \ -do { \ +#define inc_wb_pos(wb) \ +do { \ + BUILD_BUG_ON_NOT_POWER_OF_2(WB_HISTORY_SIZE); \ wb->pos = (wb->pos + 1) & (WB_HISTORY_SIZE - 1); \ - wb->len = (wb->len + 1) & (WB_HISTORY_SIZE - 1); \ + wb->len = (wb->len + 1) > WB_HISTORY_SIZE ? WB_HISTORY_SIZE : \ + wb->len + 1; \ } while (0) /* For DFAs that don't support extended tagging of states */ +/* adjust is only set if is_loop returns true */ static bool is_loop(struct match_workbuf *wb, aa_state_t state, unsigned int *adjust) { - aa_state_t pos = wb->pos; - aa_state_t i; + int pos = wb->pos; + int i; if (wb->history[pos] < state) return false; - for (i = 0; i <= wb->len; i++) { + for (i = 0; i < wb->len; i++) { if (wb->history[pos] == state) { *adjust = i; return true; } - if (pos == 0) - pos = WB_HISTORY_SIZE; - pos--; + /* -1 wraps to WB_HISTORY_SIZE - 1 */ + pos = (pos - 1) & (WB_HISTORY_SIZE - 1); } - *adjust = i; - return true; + return false; } static aa_state_t leftmatch_fb(struct aa_dfa *dfa, aa_state_t start, diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index bf8863253e07..523570aa1a5a 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -311,8 +311,7 @@ static int match_mnt_path_str(const struct cred *subj_cred, { struct aa_perms perms = { }; const char *mntpnt = NULL, *info = NULL; - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; int pos, error; AA_BUG(!profile); @@ -371,8 +370,7 @@ static int match_mnt(const struct cred *subj_cred, bool binary) { const char *devname = NULL, *info = NULL; - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; int error = -EACCES; AA_BUG(!profile); @@ -604,8 +602,7 @@ static int profile_umount(const struct cred *subj_cred, struct aa_profile *profile, const struct path *path, char *buffer) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms perms = { }; const char *name = NULL, *info = NULL; aa_state_t state; @@ -668,8 +665,7 @@ static struct aa_label 
*build_pivotroot(const struct cred *subj_cred, const struct path *old_path, char *old_buffer) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; const char *old_name, *new_name = NULL, *info = NULL; const char *trans_name = NULL; struct aa_perms perms = { }; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index 77413a519117..45cf25605c34 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -8,6 +8,7 @@ * Copyright 2009-2017 Canonical Ltd. */ +#include "include/af_unix.h" #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" @@ -24,6 +25,12 @@ struct aa_sfs_entry aa_sfs_entry_network[] = { { } }; +struct aa_sfs_entry aa_sfs_entry_networkv9[] = { + AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), + AA_SFS_FILE_BOOLEAN("af_unix", 1), + { } +}; + static const char * const net_mask_names[] = { "unknown", "send", @@ -66,6 +73,42 @@ static const char * const net_mask_names[] = { "unknown", }; +static void audit_unix_addr(struct audit_buffer *ab, const char *str, + struct sockaddr_un *addr, int addrlen) +{ + int len = unix_addr_len(addrlen); + + if (!addr || len <= 0) { + audit_log_format(ab, " %s=none", str); + } else if (addr->sun_path[0]) { + audit_log_format(ab, " %s=", str); + audit_log_untrustedstring(ab, addr->sun_path); + } else { + audit_log_format(ab, " %s=\"@", str); + if (audit_string_contains_control(&addr->sun_path[1], len - 1)) + audit_log_n_hex(ab, &addr->sun_path[1], len - 1); + else + audit_log_format(ab, "%.*s", len - 1, + &addr->sun_path[1]); + audit_log_format(ab, "\""); + } +} + +static void audit_unix_sk_addr(struct audit_buffer *ab, const char *str, + const struct sock *sk) +{ + const struct unix_sock *u = unix_sk(sk); + + if (u && u->addr) { + int addrlen; + struct sockaddr_un *addr = aa_sunaddr(u, &addrlen); + + audit_unix_addr(ab, str, addr, addrlen); + } else { + audit_unix_addr(ab, str, NULL, 0); + + } +} /* audit callback for net specific fields */ void audit_net_cb(struct audit_buffer *ab, void *va) @@ -73,12 +116,12 @@ void audit_net_cb(struct audit_buffer *ab, void *va) struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); - if (address_family_names[sa->u.net->family]) + if (address_family_names[ad->common.u.net->family]) audit_log_format(ab, " family=\"%s\"", - address_family_names[sa->u.net->family]); + address_family_names[ad->common.u.net->family]); else audit_log_format(ab, " family=\"unknown(%d)\"", - sa->u.net->family); + ad->common.u.net->family); if (sock_type_names[ad->net.type]) audit_log_format(ab, " sock_type=\"%s\"", sock_type_names[ad->net.type]); @@ -98,6 +141,19 @@ void audit_net_cb(struct audit_buffer *ab, void *va) net_mask_names, NET_PERMS_MASK); } } + if (ad->common.u.net->family == PF_UNIX) { + if (ad->net.addr || !ad->common.u.net->sk) + audit_unix_addr(ab, "addr", + unix_addr(ad->net.addr), + ad->net.addrlen); + else + audit_unix_sk_addr(ab, "addr", ad->common.u.net->sk); + if (ad->request & NET_PEER_MASK) { + audit_unix_addr(ab, "peer_addr", + unix_addr(ad->net.peer.addr), + ad->net.peer.addrlen); + } + } if (ad->peer) { audit_log_format(ab, " peer="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, @@ -105,45 +161,123 @@ void audit_net_cb(struct audit_buffer *ab, void *va) } } +/* standard permission lookup pattern - supports early bailout */ +int aa_do_perms(struct aa_profile *profile, struct aa_policydb *policy, + aa_state_t state, u32 request, + struct 
aa_perms *p, struct apparmor_audit_data *ad) +{ + struct aa_perms perms; + + AA_BUG(!profile); + AA_BUG(!policy); + + + if (state || !p) + p = aa_lookup_perms(policy, state); + perms = *p; + aa_apply_modes_to_perms(profile, &perms); + return aa_check_perms(profile, &perms, request, ad, + audit_net_cb); +} + +/* Only continue the match if the perms at the current state are + * insufficient for @request and AA_CONT_MATCH indicates more perms + * may be available in a later state. + * Returns: perms struct on early match, else NULL to continue matching + */ +static struct aa_perms *early_match(struct aa_policydb *policy, + aa_state_t state, u32 request) +{ + struct aa_perms *p; + + p = aa_lookup_perms(policy, state); + if (((p->allow & request) != request) && (p->allow & AA_CONT_MATCH)) + return NULL; + return p; +} + +static aa_state_t aa_dfa_match_be16(struct aa_dfa *dfa, aa_state_t state, + u16 data) +{ + __be16 buffer = cpu_to_be16(data); + + return aa_dfa_match_len(dfa, state, (char *) &buffer, 2); +} + +/** + * aa_match_to_prot - match the af, type, protocol triplet + * @policy: policy being matched + * @state: state to start in + * @request: permissions being requested, ignored if @p == NULL + * @af: socket address family + * @type: socket type + * @protocol: socket protocol + * @p: output - pointer to permission associated with match + * @info: output - pointer to string describing failure + * + * RETURNS: the state the match stopped in. + * + * If @p is assigned a value, the returned state is the state the match + * stopped in when @p was set. @p is only set when an early match occurs, + * never on failure or when the match runs to completion. + */ +aa_state_t aa_match_to_prot(struct aa_policydb *policy, aa_state_t state, + u32 request, u16 af, int type, int protocol, + struct aa_perms **p, const char **info) +{ + state = aa_dfa_match_be16(policy->dfa, state, (u16)af); + if (!state) { + *info = "failed af match"; + return state; + } + state = aa_dfa_match_be16(policy->dfa, state, (u16)type); + if (state) { + if (p) + *p = early_match(policy, state, request); + if (!p || !*p) { + state = aa_dfa_match_be16(policy->dfa, state, (u16)protocol); + if (!state) + *info = "failed protocol match"; + } + } else { + *info = "failed type match"; + } + + return state; +} + /* Generic af perm */ int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, - int type) + int type, int protocol) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); - struct aa_perms perms = { }; + struct aa_ruleset *rules = profile->label.rules[0]; + struct aa_perms *p = NULL; aa_state_t state; - __be16 buffer[2]; AA_BUG(family >= AF_MAX); AA_BUG(type < 0 || type >= SOCK_MAX); + AA_BUG(profile_unconfined(profile)); if (profile_unconfined(profile)) return 0; - state = RULE_MEDIATES(rules, AA_CLASS_NET); + state = RULE_MEDIATES_NET(rules); if (!state) return 0; - - buffer[0] = cpu_to_be16(family); - buffer[1] = cpu_to_be16((u16) type); - state = aa_dfa_match_len(rules->policy->dfa, state, (char *) &buffer, - 4); - perms = *aa_lookup_perms(rules->policy, state); - aa_apply_modes_to_perms(profile, &perms); - - return aa_check_perms(profile, &perms, request, ad, audit_net_cb); + state = aa_match_to_prot(rules->policy, state, request, family, type, + protocol, &p, &ad->info); + return aa_do_perms(profile, rules->policy, state, request, p, ad); } int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol) { struct aa_profile *profile; - DEFINE_AUDIT_NET(ad, op, NULL, family, type, protocol); +
DEFINE_AUDIT_NET(ad, op, subj_cred, NULL, family, type, protocol); return fn_for_each_confined(label, profile, aa_profile_af_perm(profile, &ad, request, family, - type)); + type, protocol)); } static int aa_label_sk_perm(const struct cred *subj_cred, @@ -157,9 +291,9 @@ static int aa_label_sk_perm(const struct cred *subj_cred, AA_BUG(!label); AA_BUG(!sk); - if (ctx->label != kernel_t && !unconfined(label)) { + if (rcu_access_pointer(ctx->label) != kernel_t && !unconfined(label)) { struct aa_profile *profile; - DEFINE_AUDIT_SK(ad, op, sk); + DEFINE_AUDIT_SK(ad, op, subj_cred, sk); ad.subj_cred = subj_cred; error = fn_for_each_confined(label, profile, @@ -187,12 +321,16 @@ int aa_sk_perm(const char *op, u32 request, struct sock *sk) int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, - const char *op, u32 request, struct socket *sock) + const char *op, u32 request, struct file *file) { + struct socket *sock = (struct socket *) file->private_data; + AA_BUG(!label); AA_BUG(!sock); AA_BUG(!sock->sk); + if (sock->sk->sk_family == PF_UNIX) + return aa_unix_file_perm(subj_cred, label, op, request, file); return aa_label_sk_perm(subj_cred, label, op, request, sock->sk); } @@ -223,8 +361,7 @@ static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, { int i, ret; struct aa_perms perms = { }; - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; if (rules->secmark_count == 0) return 0; @@ -257,7 +394,7 @@ int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk) { struct aa_profile *profile; - DEFINE_AUDIT_SK(ad, op, sk); + DEFINE_AUDIT_SK(ad, op, NULL, sk); return fn_for_each_confined(label, profile, aa_secmark_perm(profile, request, secid, diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index d0244fab0653..50d5345ff5cb 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -243,6 +243,9 @@ static void free_ruleset(struct aa_ruleset *rules) { int i; + if (!rules) + return; + aa_put_pdb(rules->file); aa_put_pdb(rules->policy); aa_free_cap_rules(&rules->caps); @@ -259,8 +262,6 @@ struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp) struct aa_ruleset *rules; rules = kzalloc(sizeof(*rules), gfp); - if (rules) - INIT_LIST_HEAD(&rules->list); return rules; } @@ -277,10 +278,9 @@ struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp) */ void aa_free_profile(struct aa_profile *profile) { - struct aa_ruleset *rule, *tmp; struct rhashtable *rht; - AA_DEBUG("%s(%p)\n", __func__, profile); + AA_DEBUG(DEBUG_POLICY, "%s(%p)\n", __func__, profile); if (!profile) return; @@ -299,10 +299,9 @@ void aa_free_profile(struct aa_profile *profile) * at this point there are no tasks that can have a reference * to rules */ - list_for_each_entry_safe(rule, tmp, &profile->rules, list) { - list_del_init(&rule->list); - free_ruleset(rule); - } + for (int i = 0; i < profile->n_rules; i++) + free_ruleset(profile->label.rules[i]); + kfree_sensitive(profile->dirname); if (profile->data) { @@ -331,10 +330,12 @@ struct aa_profile *aa_alloc_profile(const char *hname, struct aa_proxy *proxy, gfp_t gfp) { struct aa_profile *profile; - struct aa_ruleset *rules; - /* freed by free_profile - usually through aa_put_profile */ - profile = kzalloc(struct_size(profile, label.vec, 2), gfp); + /* freed by free_profile - usually through aa_put_profile + * this adds space for a single ruleset in the rules section of the + * label + */ + 
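/*
 * Sizing sketch (assuming label.rules is the flexible-array member being
 * sized, per the usual <linux/overflow.h> semantics): the struct_size()
 * call below expands, with overflow checking, to
 *
 *	sizeof(*profile) + 1 * sizeof(profile->label.rules[0])
 *
 * i.e. room for exactly one trailing ruleset pointer.
 */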
profile = kzalloc(struct_size(profile, label.rules, 1), gfp); if (!profile) return NULL; @@ -343,13 +344,11 @@ struct aa_profile *aa_alloc_profile(const char *hname, struct aa_proxy *proxy, if (!aa_label_init(&profile->label, 1, gfp)) goto fail; - INIT_LIST_HEAD(&profile->rules); - /* allocate the first ruleset, but leave it empty */ - rules = aa_alloc_ruleset(gfp); - if (!rules) + profile->label.rules[0] = aa_alloc_ruleset(gfp); + if (!profile->label.rules[0]) goto fail; - list_add(&rules->list, &profile->rules); + profile->n_rules = 1; /* update being set needed by fs interface */ if (!proxy) { @@ -364,6 +363,7 @@ struct aa_profile *aa_alloc_profile(const char *hname, struct aa_proxy *proxy, profile->label.flags |= FLAG_PROFILE; profile->label.vec[0] = profile; + profile->signal = SIGKILL; /* refcount released by caller */ return profile; @@ -373,6 +373,41 @@ fail: return NULL; } +static inline bool ANY_RULE_MEDIATES(struct aa_profile *profile, + unsigned char class) +{ + int i; + + for (i = 0; i < profile->n_rules; i++) { + if (RULE_MEDIATES(profile->label.rules[i], class)) + return true; + } + return false; +} + +/* set of rules that are mediated by unconfined */ +static int unconfined_mediates[] = { AA_CLASS_NS, AA_CLASS_IO_URING, 0 }; + +/* must be called after profile rulesets and start information is setup */ +void aa_compute_profile_mediates(struct aa_profile *profile) +{ + int c; + + if (profile_unconfined(profile)) { + int *pos; + + for (pos = unconfined_mediates; *pos; pos++) { + if (ANY_RULE_MEDIATES(profile, *pos)) + profile->label.mediates |= ((u64) 1) << *pos; + } + return; + } + for (c = 0; c <= AA_CLASS_LAST; c++) { + if (ANY_RULE_MEDIATES(profile, c)) + profile->label.mediates |= ((u64) 1) << c; + } +} + /* TODO: profile accounting - setup in remove */ /** @@ -463,7 +498,7 @@ static struct aa_policy *__lookup_parent(struct aa_ns *ns, } /** - * __create_missing_ancestors - create place holders for missing ancestores + * __create_missing_ancestors - create place holders for missing ancestors * @ns: namespace to lookup profile in (NOT NULL) * @hname: hierarchical profile name to find parent of (NOT NULL) * @gfp: type of allocation. @@ -621,13 +656,15 @@ struct aa_profile *aa_alloc_null(struct aa_profile *parent, const char *name, /* TODO: ideally we should inherit abi from parent */ profile->label.flags |= FLAG_NULL; profile->attach.xmatch = aa_get_pdb(nullpdb); - rules = list_first_entry(&profile->rules, typeof(*rules), list); + rules = profile->label.rules[0]; rules->file = aa_get_pdb(nullpdb); rules->policy = aa_get_pdb(nullpdb); + aa_compute_profile_mediates(profile); if (parent) { profile->path_flags = parent->path_flags; - + /* override/inherit what is mediated from parent */ + profile->label.mediates = parent->label.mediates; /* released on free_profile */ rcu_assign_pointer(profile->parent, aa_get_profile(parent)); profile->ns = aa_get_ns(parent->ns); @@ -833,8 +870,8 @@ bool aa_policy_admin_capable(const struct cred *subj_cred, bool capable = policy_ns_capable(subj_cred, label, user_ns, CAP_MAC_ADMIN) == 0; - AA_DEBUG("cap_mac_admin? %d\n", capable); - AA_DEBUG("policy locked? %d\n", aa_g_lock_policy); + AA_DEBUG(DEBUG_POLICY, "cap_mac_admin? %d\n", capable); + AA_DEBUG(DEBUG_POLICY, "policy locked? 
%d\n", aa_g_lock_policy); return aa_policy_view_capable(subj_cred, label, ns) && capable && !aa_g_lock_policy; @@ -843,11 +880,11 @@ bool aa_policy_admin_capable(const struct cred *subj_cred, bool aa_current_policy_view_capable(struct aa_ns *ns) { struct aa_label *label; - bool res; + bool needput, res; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); res = aa_policy_view_capable(current_cred(), label, ns); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return res; } @@ -855,11 +892,11 @@ bool aa_current_policy_view_capable(struct aa_ns *ns) bool aa_current_policy_admin_capable(struct aa_ns *ns) { struct aa_label *label; - bool res; + bool needput, res; - label = __begin_current_label_crit_section(); + label = __begin_current_label_crit_section(&needput); res = aa_policy_admin_capable(current_cred(), label, ns); - __end_current_label_crit_section(label); + __end_current_label_crit_section(label, needput); return res; } @@ -1068,7 +1105,7 @@ ssize_t aa_replace_profiles(struct aa_ns *policy_ns, struct aa_label *label, goto out; /* ensure that profiles are all for the same ns - * TODO: update locking to remove this constaint. All profiles in + * TODO: update locking to remove this constraint. All profiles in * the load set must succeed as a set or the load will * fail. Sort ent list and take ns locks in hierarchy order */ diff --git a/security/apparmor/policy_compat.c b/security/apparmor/policy_compat.c index 423227670e68..cfc2207e5a12 100644 --- a/security/apparmor/policy_compat.c +++ b/security/apparmor/policy_compat.c @@ -286,10 +286,10 @@ static void remap_dfa_accept(struct aa_dfa *dfa, unsigned int factor) AA_BUG(!dfa); - for (state = 0; state < state_count; state++) + for (state = 0; state < state_count; state++) { ACCEPT_TABLE(dfa)[state] = state * factor; - kvfree(dfa->tables[YYTD_ID_ACCEPT2]); - dfa->tables[YYTD_ID_ACCEPT2] = NULL; + ACCEPT_TABLE2(dfa)[state] = factor > 1 ? 
ACCEPT_FLAG_OWNER : 0; + } } /* TODO: merge different dfa mappings into single map_policy fn */ diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index 1f02cfe1d974..64783ca3b0f2 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -107,7 +107,7 @@ static struct aa_ns *alloc_ns(const char *prefix, const char *name) struct aa_ns *ns; ns = kzalloc(sizeof(*ns), GFP_KERNEL); - AA_DEBUG("%s(%p)\n", __func__, ns); + AA_DEBUG(DEBUG_POLICY, "%s(%p)\n", __func__, ns); if (!ns) return NULL; if (!aa_policy_init(&ns->base, prefix, name, GFP_KERNEL)) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 992b74c50d64..7523971e37d9 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -29,6 +29,7 @@ #include "include/policy.h" #include "include/policy_unpack.h" #include "include/policy_compat.h" +#include "include/signal.h" /* audit callback for unpack fields */ static void audit_cb(struct audit_buffer *ab, void *va) @@ -598,8 +599,8 @@ static bool unpack_secmark(struct aa_ext *e, struct aa_ruleset *rules) fail: if (rules->secmark) { for (i = 0; i < size; i++) - kfree(rules->secmark[i].label); - kfree(rules->secmark); + kfree_sensitive(rules->secmark[i].label); + kfree_sensitive(rules->secmark); rules->secmark_count = 0; rules->secmark = NULL; } @@ -716,6 +717,7 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, void *pos = e->pos; int i, flags, error = -EPROTO; ssize_t size; + u32 version = 0; pdb = aa_alloc_pdb(GFP_KERNEL); if (!pdb) @@ -733,6 +735,9 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, if (pdb->perms) { /* perms table present accept is index */ flags = TO_ACCEPT1_FLAG(YYTD_DATA32); + if (aa_unpack_u32(e, &version, "permsv") && version > 2) + /* accept2 used for dfa flags */ + flags |= TO_ACCEPT2_FLAG(YYTD_DATA32); } else { /* packed perms in accept1 and accept2 */ flags = TO_ACCEPT1_FLAG(YYTD_DATA32) | @@ -770,6 +775,21 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, } } + /* accept2 is in some cases being allocated, even with perms */ + if (pdb->perms && !pdb->dfa->tables[YYTD_ID_ACCEPT2]) { + /* add dfa flags table missing in v2 */ + u32 noents = pdb->dfa->tables[YYTD_ID_ACCEPT]->td_lolen; + u16 tdflags = pdb->dfa->tables[YYTD_ID_ACCEPT]->td_flags; + size_t tsize = table_size(noents, tdflags); + + pdb->dfa->tables[YYTD_ID_ACCEPT2] = kvzalloc(tsize, GFP_KERNEL); + if (!pdb->dfa->tables[YYTD_ID_ACCEPT2]) { + *info = "failed to alloc dfa flags table"; + goto out; + } + pdb->dfa->tables[YYTD_ID_ACCEPT2]->td_lolen = noents; + pdb->dfa->tables[YYTD_ID_ACCEPT2]->td_flags = tdflags; + } /* * Unfortunately due to a bug in earlier userspaces, a * transition table may be present even when the dfa is @@ -783,9 +803,13 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, if (!pdb->dfa && pdb->trans.table) aa_free_str_table(&pdb->trans); - /* TODO: move compat mapping here, requires dfa merging first */ - /* TODO: move verify here, it has to be done after compat mappings */ - + /* TODO: + * - move compat mapping here, requires dfa merging first + * - move verify here, it has to be done after compat mappings + * - move free of unneeded trans table here, has to be done + * after perm mapping. 
+ */ +out: *policy = pdb; return 0; @@ -862,7 +886,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) error = -ENOMEM; goto fail; } - rules = list_first_entry(&profile->rules, typeof(*rules), list); + rules = profile->label.rules[0]; /* profile renaming is optional */ (void) aa_unpack_str(e, &profile->rename, "rename"); @@ -898,6 +922,12 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) (void) aa_unpack_strdup(e, &disconnected, "disconnected"); profile->disconnected = disconnected; + /* optional */ + (void) aa_unpack_u32(e, &profile->signal, "kill"); + if (profile->signal < 1 || profile->signal > MAXMAPPED_SIG) { + info = "profile kill.signal invalid value"; + goto fail; + } /* per profile debug flags (complain, audit) */ if (!aa_unpack_nameX(e, AA_STRUCT, "flags")) { info = "profile missing flags"; @@ -1101,6 +1131,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + aa_compute_profile_mediates(profile); + return profile; fail: @@ -1215,21 +1247,32 @@ static bool verify_perm(struct aa_perms *perm) static bool verify_perms(struct aa_policydb *pdb) { int i; + int xidx, xmax = -1; for (i = 0; i < pdb->size; i++) { if (!verify_perm(&pdb->perms[i])) return false; /* verify indexes into str table */ - if ((pdb->perms[i].xindex & AA_X_TYPE_MASK) == AA_X_TABLE && - (pdb->perms[i].xindex & AA_X_INDEX_MASK) >= pdb->trans.size) - return false; + if ((pdb->perms[i].xindex & AA_X_TYPE_MASK) == AA_X_TABLE) { + xidx = pdb->perms[i].xindex & AA_X_INDEX_MASK; + if (xidx >= pdb->trans.size) + return false; + if (xmax < xidx) + xmax = xidx; + } if (pdb->perms[i].tag && pdb->perms[i].tag >= pdb->trans.size) return false; if (pdb->perms[i].label && pdb->perms[i].label >= pdb->trans.size) return false; } - + /* deal with incorrectly constructed string tables */ + if (xmax == -1) { + aa_free_str_table(&pdb->trans); + } else if (pdb->trans.size > xmax + 1) { + if (!aa_resize_str_table(&pdb->trans, xmax + 1, GFP_KERNEL)) + return false; + } return true; } @@ -1243,8 +1286,8 @@ static bool verify_perms(struct aa_policydb *pdb) */ static int verify_profile(struct aa_profile *profile) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; + if (!rules) return 0; diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 5b2ba88ae9e2..cf18744dafe2 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -9,6 +9,8 @@ #include "include/policy.h" #include "include/policy_unpack.h" +#include <linux/unaligned.h> + #define TEST_STRING_NAME "TEST_STRING" #define TEST_STRING_DATA "testing" #define TEST_STRING_BUF_OFFSET \ @@ -80,7 +82,7 @@ static struct aa_ext *build_aa_ext_struct(struct policy_unpack_fixture *puf, *(buf + 1) = strlen(TEST_U32_NAME) + 1; strscpy(buf + 3, TEST_U32_NAME, e->end - (void *)(buf + 3)); *(buf + 3 + strlen(TEST_U32_NAME) + 1) = AA_U32; - *((__le32 *)(buf + 3 + strlen(TEST_U32_NAME) + 2)) = cpu_to_le32(TEST_U32_DATA); + put_unaligned_le32(TEST_U32_DATA, buf + 3 + strlen(TEST_U32_NAME) + 2); buf = e->start + TEST_NAMED_U64_BUF_OFFSET; *buf = AA_NAME; @@ -103,7 +105,7 @@ static struct aa_ext *build_aa_ext_struct(struct policy_unpack_fixture *puf, *(buf + 1) = strlen(TEST_ARRAY_NAME) + 1; strscpy(buf + 3, TEST_ARRAY_NAME, e->end - (void *)(buf + 3)); *(buf + 3 + strlen(TEST_ARRAY_NAME) + 1) = AA_ARRAY; - *((__le16 *)(buf + 3 + 
strlen(TEST_ARRAY_NAME) + 2)) = cpu_to_le16(TEST_ARRAY_SIZE); + put_unaligned_le16(TEST_ARRAY_SIZE, buf + 3 + strlen(TEST_ARRAY_NAME) + 2); return e; } diff --git a/security/apparmor/procattr.c b/security/apparmor/procattr.c index e3857e3d7c6c..ce40f15d4952 100644 --- a/security/apparmor/procattr.c +++ b/security/apparmor/procattr.c @@ -125,12 +125,14 @@ int aa_setprocattr_changehat(char *args, size_t size, int flags) for (count = 0; (hat < end) && count < 16; ++count) { char *next = hat + strlen(hat) + 1; hats[count] = hat; - AA_DEBUG("%s: (pid %d) Magic 0x%llx count %d hat '%s'\n" + AA_DEBUG(DEBUG_DOMAIN, + "%s: (pid %d) Magic 0x%llx count %d hat '%s'\n" , __func__, current->pid, token, count, hat); hat = next; } } else - AA_DEBUG("%s: (pid %d) Magic 0x%llx count %d Hat '%s'\n", + AA_DEBUG(DEBUG_DOMAIN, + "%s: (pid %d) Magic 0x%llx count %d Hat '%s'\n", __func__, current->pid, token, count, "<NULL>"); return aa_change_hat(hats, count, token, flags); diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index dcc94c3153d5..8e80db3ae21c 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -89,8 +89,7 @@ static int profile_setrlimit(const struct cred *subj_cred, struct aa_profile *profile, unsigned int resource, struct rlimit *new_rlim) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; int e = 0; if (rules->rlimits.mask & (1 << resource) && new_rlim->rlim_max > @@ -165,9 +164,7 @@ void __aa_transition_rlimits(struct aa_label *old_l, struct aa_label *new_l) * to the lesser of the tasks hard limit and the init tasks soft limit */ label_for_each_confined(i, old_l, old) { - struct aa_ruleset *rules = list_first_entry(&old->rules, - typeof(*rules), - list); + struct aa_ruleset *rules = old->label.rules[0]; if (rules->rlimits.mask) { int j; @@ -185,9 +182,7 @@ void __aa_transition_rlimits(struct aa_label *old_l, struct aa_label *new_l) /* set any new hard limits as dictated by the new profile */ label_for_each_confined(i, new_l, new) { - struct aa_ruleset *rules = list_first_entry(&new->rules, - typeof(*rules), - list); + struct aa_ruleset *rules = new->label.rules[0]; int j; if (!rules->rlimits.mask) diff --git a/security/apparmor/task.c b/security/apparmor/task.c index c87fb9f4ac18..c9bc9cc69475 100644 --- a/security/apparmor/task.c +++ b/security/apparmor/task.c @@ -228,8 +228,7 @@ static int profile_ptrace_perm(const struct cred *cred, struct aa_label *peer, u32 request, struct apparmor_audit_data *ad) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), list); + struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms perms = { }; ad->subj_cred = cred; @@ -246,7 +245,7 @@ static int profile_tracee_perm(const struct cred *cred, struct apparmor_audit_data *ad) { if (profile_unconfined(tracee) || unconfined(tracer) || - !ANY_RULE_MEDIATES(&tracee->rules, AA_CLASS_PTRACE)) + !label_mediates(&tracee->label, AA_CLASS_PTRACE)) return 0; return profile_ptrace_perm(cred, tracee, tracer, request, ad); @@ -260,7 +259,7 @@ static int profile_tracer_perm(const struct cred *cred, if (profile_unconfined(tracer)) return 0; - if (ANY_RULE_MEDIATES(&tracer->rules, AA_CLASS_PTRACE)) + if (label_mediates(&tracer->label, AA_CLASS_PTRACE)) return profile_ptrace_perm(cred, tracer, tracee, request, ad); /* profile uses the old style capability check for ptrace */ @@ -324,9 +323,7 @@ int aa_profile_ns_perm(struct aa_profile *profile, 
ad->request = request; if (!profile_unconfined(profile)) { - struct aa_ruleset *rules = list_first_entry(&profile->rules, - typeof(*rules), - list); + struct aa_ruleset *rules = profile->label.rules[0]; aa_state_t state; state = RULE_MEDIATES(rules, ad->class); diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 11def1ad046c..20bbd461515e 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays procacct +PROGS := getdelays procacct delaytop all: $(PROGS) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c new file mode 100644 index 000000000000..9afb1ffc00ba --- /dev/null +++ b/tools/accounting/delaytop.c @@ -0,0 +1,862 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * delaytop.c - system-wide delay monitoring tool. + * + * This tool provides real-time monitoring and statistics of + * system, container, and task-level delays, including CPU, + * memory, IO, and IRQ. It supports an interactive (top-like) mode + * and can output delay information for the whole system, specific + * containers (cgroups), or individual tasks (PIDs). + * + * Key features: + * - Collects per-task delay accounting statistics via taskstats. + * - Collects system-wide PSI information. + * - Supports sorting and filtering. + * - Supports interactive display (screen refresh). + * + * Copyright (C) Fan Yu, ZTE Corp. 2025 + * Copyright (C) Wang Yaxin, ZTE Corp. 2025 + * + * Compile with + * gcc -I/usr/src/linux/include delaytop.c -o delaytop + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <getopt.h> +#include <signal.h> +#include <time.h> +#include <dirent.h> +#include <ctype.h> +#include <stdbool.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/select.h> +#include <termios.h> +#include <limits.h> +#include <linux/genetlink.h> +#include <linux/taskstats.h> +#include <linux/cgroupstats.h> + +#define PSI_CPU_SOME "/proc/pressure/cpu" +#define PSI_CPU_FULL "/proc/pressure/cpu" +#define PSI_MEMORY_SOME "/proc/pressure/memory" +#define PSI_MEMORY_FULL "/proc/pressure/memory" +#define PSI_IO_SOME "/proc/pressure/io" +#define PSI_IO_FULL "/proc/pressure/io" +#define PSI_IRQ_FULL "/proc/pressure/irq" + +#define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) + +#define TASK_COMM_LEN 16 +#define MAX_MSG_SIZE 1024 +#define MAX_TASKS 1000 +#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field +#define BOOL_FPRINT(stream, fmt, ...) 
\ +({ \ + int ret = fprintf(stream, fmt, ##__VA_ARGS__); \ + ret >= 0; \ +}) +#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" + +/* Program settings structure */ +struct config { + int delay; /* Update interval in seconds */ + int iterations; /* Number of iterations, 0 == infinite */ + int max_processes; /* Maximum number of processes to show */ + char sort_field; /* Field to sort by */ + int output_one_time; /* Output once and exit */ + int monitor_pid; /* Monitor specific PID */ + char *container_path; /* Path to container cgroup */ +}; + +/* PSI statistics structure */ +struct psi_stats { + double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300; + unsigned long long cpu_some_total; + double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300; + unsigned long long cpu_full_total; + double memory_some_avg10, memory_some_avg60, memory_some_avg300; + unsigned long long memory_some_total; + double memory_full_avg10, memory_full_avg60, memory_full_avg300; + unsigned long long memory_full_total; + double io_some_avg10, io_some_avg60, io_some_avg300; + unsigned long long io_some_total; + double io_full_avg10, io_full_avg60, io_full_avg300; + unsigned long long io_full_total; + double irq_full_avg10, irq_full_avg60, irq_full_avg300; + unsigned long long irq_full_total; +}; + +/* Task delay information structure */ +struct task_info { + int pid; + int tgid; + char command[TASK_COMM_LEN]; + unsigned long long cpu_count; + unsigned long long cpu_delay_total; + unsigned long long blkio_count; + unsigned long long blkio_delay_total; + unsigned long long swapin_count; + unsigned long long swapin_delay_total; + unsigned long long freepages_count; + unsigned long long freepages_delay_total; + unsigned long long thrashing_count; + unsigned long long thrashing_delay_total; + unsigned long long compact_count; + unsigned long long compact_delay_total; + unsigned long long wpcopy_count; + unsigned long long wpcopy_delay_total; + unsigned long long irq_count; + unsigned long long irq_delay_total; +}; + +/* Container statistics structure */ +struct container_stats { + int nr_sleeping; /* Number of sleeping processes */ + int nr_running; /* Number of running processes */ + int nr_stopped; /* Number of stopped processes */ + int nr_uninterruptible; /* Number of uninterruptible processes */ + int nr_io_wait; /* Number of processes in IO wait */ +}; + +/* Global variables */ +static struct config cfg; +static struct psi_stats psi; +static struct task_info tasks[MAX_TASKS]; +static int task_count; +static int running = 1; +static struct container_stats container_stats; + +/* Netlink socket variables */ +static int nl_sd = -1; +static int family_id; + +/* Set terminal to non-canonical mode for q-to-quit */ +static struct termios orig_termios; +static void enable_raw_mode(void) +{ + struct termios raw; + + tcgetattr(STDIN_FILENO, &orig_termios); + raw = orig_termios; + raw.c_lflag &= ~(ICANON | ECHO); + tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw); +} +static void disable_raw_mode(void) +{ + tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); +} + +/* Display usage information and command line options */ +static void usage(void) +{ + printf("Usage: delaytop [Options]\n" + "Options:\n" + " -h, --help Show this help message and exit\n" + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" + " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" + " -o, --once Display once 
and exit\n" + " -p, --pid=PID Monitor only the specified PID\n" + " -C, --container=PATH Monitor the container at specified cgroup path\n"); + exit(0); +} + +/* Parse command line arguments and set configuration */ +static void parse_args(int argc, char **argv) +{ + int c; + struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"delay", required_argument, 0, 'd'}, + {"iterations", required_argument, 0, 'n'}, + {"pid", required_argument, 0, 'p'}, + {"once", no_argument, 0, 'o'}, + {"processes", required_argument, 0, 'P'}, + {"container", required_argument, 0, 'C'}, + {0, 0, 0, 0} + }; + + /* Set defaults */ + cfg.delay = 2; + cfg.iterations = 0; + cfg.max_processes = 20; + cfg.sort_field = 'c'; /* Default sort by CPU delay */ + cfg.output_one_time = 0; + cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ + cfg.container_path = NULL; + + while (1) { + int option_index = 0; + + c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + usage(); + break; + case 'd': + cfg.delay = atoi(optarg); + if (cfg.delay < 1) { + fprintf(stderr, "Error: delay must be >= 1.\n"); + exit(1); + } + break; + case 'n': + cfg.iterations = atoi(optarg); + if (cfg.iterations < 0) { + fprintf(stderr, "Error: iterations must be >= 0.\n"); + exit(1); + } + break; + case 'p': + cfg.monitor_pid = atoi(optarg); + if (cfg.monitor_pid < 1) { + fprintf(stderr, "Error: pid must be >= 1.\n"); + exit(1); + } + break; + case 'o': + cfg.output_one_time = 1; + break; + case 'P': + cfg.max_processes = atoi(optarg); + if (cfg.max_processes < 1) { + fprintf(stderr, "Error: processes must be >= 1.\n"); + exit(1); + } + if (cfg.max_processes > MAX_TASKS) { + fprintf(stderr, "Warning: processes capped to %d.\n", + MAX_TASKS); + cfg.max_processes = MAX_TASKS; + } + break; + case 'C': + cfg.container_path = strdup(optarg); + break; + default: + fprintf(stderr, "Try 'delaytop --help' for more information.\n"); + exit(1); + } + } +} + +/* Create a raw netlink socket and bind */ +static int create_nl_socket(void) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd < 0) + return -1; + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) { + fprintf(stderr, "Failed to bind socket when creating nl_socket\n"); + close(fd); + return -1; + } + + return fd; +} + +/* Send a command via netlink */ +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct sockaddr_nl nladdr; + struct nlattr *na; + int r, buflen; + char *buf; + + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +}
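/*
 * For reference, a minimal sketch of the taskstats request/reply
 * round-trip built on send_cmd(); the function name is illustrative and
 * not part of the tool. It assumes nl_sd and family_id have been
 * initialized as in main(), and leaves reply parsing to
 * fetch_and_fill_task_info() further down.
 */
static int query_pid_stats_sketch(int pid)
{
	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[MAX_MSG_SIZE];
	} resp;

	/* TASKSTATS_CMD_ATTR_PID selects a single task by its PID */
	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
		     TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0)
		return -1;

	/* the reply nests TASKSTATS_TYPE_STATS inside TASKSTATS_TYPE_AGGR_PID */
	if (recv(nl_sd, &resp, sizeof(resp), 0) < 0 ||
	    resp.n.nlmsg_type == NLMSG_ERROR)
		return -1;

	return 0;
}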
+ +/* Get family ID for taskstats via netlink */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + char name[100]; + + strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1); + name[sizeof(name) - 1] = '\0'; + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) { + fprintf(stderr, "Failed to send cmd for family id\n"); + return 0; + } + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) { + fprintf(stderr, "Failed to receive response for family id\n"); + return 0; + } + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + return id; +} + +static void read_psi_stats(void) +{ + FILE *fp; + char line[256]; + int ret = 0; + /* Zero all fields */ + memset(&psi, 0, sizeof(psi)); + /* CPU pressure */ + fp = fopen(PSI_CPU_SOME, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "some", 4) == 0) { + ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.cpu_some_avg10, &psi.cpu_some_avg60, + &psi.cpu_some_avg300, &psi.cpu_some_total); + if (ret != 4) + fprintf(stderr, "Failed to parse CPU some PSI data\n"); + } else if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.cpu_full_avg10, &psi.cpu_full_avg60, + &psi.cpu_full_avg300, &psi.cpu_full_total); + if (ret != 4) + fprintf(stderr, "Failed to parse CPU full PSI data\n"); + } + } + fclose(fp); + } + /* Memory pressure */ + fp = fopen(PSI_MEMORY_SOME, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "some", 4) == 0) { + ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.memory_some_avg10, &psi.memory_some_avg60, + &psi.memory_some_avg300, &psi.memory_some_total); + if (ret != 4) + fprintf(stderr, "Failed to parse Memory some PSI data\n"); + } else if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.memory_full_avg10, &psi.memory_full_avg60, + &psi.memory_full_avg300, &psi.memory_full_total); + if (ret != 4) + fprintf(stderr, "Failed to parse Memory full PSI data\n"); + } + } + fclose(fp); + } + /* IO pressure */ + fp = fopen(PSI_IO_SOME, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "some", 4) == 0) { + ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.io_some_avg10, &psi.io_some_avg60, + &psi.io_some_avg300, &psi.io_some_total); + if (ret != 4) + fprintf(stderr, "Failed to parse IO some PSI data\n"); + } else if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.io_full_avg10, &psi.io_full_avg60, + &psi.io_full_avg300, &psi.io_full_total); + if (ret != 4) + fprintf(stderr, "Failed to parse IO full PSI data\n"); + } + } + fclose(fp); + } + /* IRQ pressure (only full) */ + fp = fopen(PSI_IRQ_FULL, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.irq_full_avg10, &psi.irq_full_avg60, + &psi.irq_full_avg300, &psi.irq_full_total); + if (ret != 4) + 
fprintf(stderr, "Failed to parse IRQ full PSI data\n"); + } + } + fclose(fp); + } +} + +static int read_comm(int pid, char *comm_buf, size_t buf_size) +{ + char path[64]; + int ret = -1; + size_t len; + FILE *fp; + + snprintf(path, sizeof(path), "/proc/%d/comm", pid); + fp = fopen(path, "r"); + if (!fp) { + fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid); + return ret; + } + + if (fgets(comm_buf, buf_size, fp)) { + len = strlen(comm_buf); + if (len > 0 && comm_buf[len - 1] == '\n') + comm_buf[len - 1] = '\0'; + ret = 0; + } + + fclose(fp); + + return ret; +} + +static void fetch_and_fill_task_info(int pid, const char *comm) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } resp; + struct taskstats stats; + struct nlattr *nested; + struct nlattr *na; + int nested_len; + int nl_len; + int rc; + + /* Send request for task stats */ + if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) { + fprintf(stderr, "Failed to send request for task stats\n"); + return; + } + + /* Receive response */ + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { + fprintf(stderr, "Failed to receive response for task stats\n"); + return; + } + + /* Parse response */ + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) { + nested = (struct nlattr *) NLA_DATA(na); + nested_len = NLA_PAYLOAD(na->nla_len); + while (nested_len > 0) { + if (nested->nla_type == TASKSTATS_TYPE_STATS) { + memcpy(&stats, NLA_DATA(nested), sizeof(stats)); + if (task_count < MAX_TASKS) { + tasks[task_count].pid = pid; + tasks[task_count].tgid = pid; + strncpy(tasks[task_count].command, comm, + TASK_COMM_LEN - 1); + tasks[task_count].command[TASK_COMM_LEN - 1] = '\0'; + SET_TASK_STAT(task_count, cpu_count); + SET_TASK_STAT(task_count, cpu_delay_total); + SET_TASK_STAT(task_count, blkio_count); + SET_TASK_STAT(task_count, blkio_delay_total); + SET_TASK_STAT(task_count, swapin_count); + SET_TASK_STAT(task_count, swapin_delay_total); + SET_TASK_STAT(task_count, freepages_count); + SET_TASK_STAT(task_count, freepages_delay_total); + SET_TASK_STAT(task_count, thrashing_count); + SET_TASK_STAT(task_count, thrashing_delay_total); + SET_TASK_STAT(task_count, compact_count); + SET_TASK_STAT(task_count, compact_delay_total); + SET_TASK_STAT(task_count, wpcopy_count); + SET_TASK_STAT(task_count, wpcopy_delay_total); + SET_TASK_STAT(task_count, irq_count); + SET_TASK_STAT(task_count, irq_delay_total); + task_count++; + } + break; + } + nested_len -= NLA_ALIGN(nested->nla_len); + nested = NLA_NEXT(nested); + } + } + nl_len -= NLA_ALIGN(na->nla_len); + na = NLA_NEXT(na); + } + return; +} + +static void get_task_delays(void) +{ + char comm[TASK_COMM_LEN]; + struct dirent *entry; + DIR *dir; + int pid; + + task_count = 0; + if (cfg.monitor_pid > 0) { + if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0) + fetch_and_fill_task_info(cfg.monitor_pid, comm); + return; + } + + dir = opendir("/proc"); + if (!dir) { + fprintf(stderr, "Error opening /proc directory\n"); + return; + } + + while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) { + if (!isdigit(entry->d_name[0])) + continue; + pid = atoi(entry->d_name); + if (pid == 0) + continue; + if (read_comm(pid, comm, sizeof(comm)) != 0) + continue; + fetch_and_fill_task_info(pid, comm); + } + closedir(dir); +} + +/* Calculate average delay in 
milliseconds */ +static double average_ms(unsigned long long total, unsigned long long count) +{ + if (count == 0) + return 0; + return (double)total / 1000000.0 / count; +} + +/* Comparison function for sorting tasks */ +static int compare_tasks(const void *a, const void *b) +{ + const struct task_info *t1 = (const struct task_info *)a; + const struct task_info *t2 = (const struct task_info *)b; + double avg1, avg2; + + switch (cfg.sort_field) { + case 'c': /* CPU */ + avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); + avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); + if (avg1 != avg2) + return avg2 > avg1 ? 1 : -1; + return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + + default: + return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + } +} + +/* Sort tasks by selected field */ +static void sort_tasks(void) +{ + if (task_count > 0) + qsort(tasks, task_count, sizeof(struct task_info), compare_tasks); +} + +/* Get container statistics via cgroupstats */ +static void get_container_stats(void) +{ + int rc, cfd; + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } req, resp; + struct nlattr *na; + int nl_len; + struct cgroupstats stats; + + /* Check if container path is set */ + if (!cfg.container_path) + return; + + /* Open container cgroup */ + cfd = open(cfg.container_path, O_RDONLY); + if (cfd < 0) { + fprintf(stderr, "Error opening container path: %s\n", cfg.container_path); + return; + } + + /* Send request for container stats */ + if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) { + fprintf(stderr, "Failed to send request for container stats\n"); + close(cfd); + return; + } + + /* Receive response */ + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { + fprintf(stderr, "Failed to receive response for container stats\n"); + close(cfd); + return; + } + + /* Parse response */ + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) { + /* Get the cgroupstats structure */ + memcpy(&stats, NLA_DATA(na), sizeof(stats)); + + /* Fill container stats */ + container_stats.nr_sleeping = stats.nr_sleeping; + container_stats.nr_running = stats.nr_running; + container_stats.nr_stopped = stats.nr_stopped; + container_stats.nr_uninterruptible = stats.nr_uninterruptible; + container_stats.nr_io_wait = stats.nr_io_wait; + break; + } + nl_len -= NLA_ALIGN(na->nla_len); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + } + + close(cfd); +} + +/* Display results to stdout or log file */ +static void display_results(void) +{ + time_t now = time(NULL); + struct tm *tm_now = localtime(&now); + FILE *out = stdout; + char timestamp[32]; + bool suc = true; + int i, count; + + /* Clear terminal screen */ + suc &= BOOL_FPRINT(out, "\033[H\033[J"); + + /* PSI output (one-line, no cat style) */ + suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n"); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU some:", + psi.cpu_some_avg10, + psi.cpu_some_avg60, + psi.cpu_some_avg300, + psi.cpu_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU full:", + psi.cpu_full_avg10, + psi.cpu_full_avg60, + psi.cpu_full_avg300, + psi.cpu_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory full:", + psi.memory_full_avg10, + psi.memory_full_avg60, + psi.memory_full_avg300, + 
psi.memory_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory some:", + psi.memory_some_avg10, + psi.memory_some_avg60, + psi.memory_some_avg300, + psi.memory_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO full:", + psi.io_full_avg10, + psi.io_full_avg60, + psi.io_full_avg300, + psi.io_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO some:", + psi.io_some_avg10, + psi.io_some_avg60, + psi.io_some_avg300, + psi.io_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IRQ full:", + psi.irq_full_avg10, + psi.irq_full_avg60, + psi.irq_full_avg300, + psi.irq_full_total / 1000); + + if (cfg.container_path) { + suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path); + suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ", + container_stats.nr_running, container_stats.nr_sleeping); + suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n", + container_stats.nr_stopped, container_stats.nr_uninterruptible, + container_stats.nr_io_wait); + } + suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n", + cfg.max_processes); + suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND"); + suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n", + "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)", + "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)"); + + suc &= BOOL_FPRINT(out, "-----------------------------------------------"); + suc &= BOOL_FPRINT(out, "----------------------------------------------\n"); + count = task_count < cfg.max_processes ? task_count : cfg.max_processes; + + for (i = 0; i < count; i++) { + suc &= BOOL_FPRINT(out, "%5d %5d %-15s", + tasks[i].pid, tasks[i].tgid, tasks[i].command); + suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", + average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), + average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), + average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), + average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), + average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), + average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), + average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), + average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); + } + + suc &= BOOL_FPRINT(out, "\n"); + + if (!suc) + perror("Error writing to output"); +} + +/* Main function */ +int main(int argc, char **argv) +{ + int iterations = 0; + int use_q_quit = 0; + + /* Parse command line arguments */ + parse_args(argc, argv); + + /* Setup netlink socket */ + nl_sd = create_nl_socket(); + if (nl_sd < 0) { + fprintf(stderr, "Error creating netlink socket\n"); + exit(1); + } + + /* Get family ID for taskstats via netlink */ + family_id = get_family_id(nl_sd); + if (!family_id) { + fprintf(stderr, "Error getting taskstats family ID\n"); + close(nl_sd); + exit(1); + } + + if (!cfg.output_one_time) { + use_q_quit = 1; + enable_raw_mode(); + printf("Press 'q' to quit.\n"); + fflush(stdout); + } + + /* Main loop */ + while (running) { + /* Read PSI statistics */ + read_psi_stats(); + + /* Get container stats if container path provided */ + if (cfg.container_path) + get_container_stats(); + + /* Get task delays */ + get_task_delays(); + + /* Sort tasks */ + sort_tasks(); + + /* Display results to stdout or log file */ + display_results(); + + /* Check for iterations */ + if (cfg.iterations > 0 && ++iterations >= cfg.iterations) + break; + + /* Exit if 
output_one_time is set */ + if (cfg.output_one_time) + break; + + /* Check for 'q' key to quit */ + if (use_q_quit) { + struct timeval tv = {cfg.delay, 0}; + fd_set readfds; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); + + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { + char ch = 0; + + read(STDIN_FILENO, &ch, 1); + if (ch == 'q' || ch == 'Q') { + running = 0; + break; + } + } + } else { + sleep(cfg.delay); + } + } + + /* Restore terminal mode */ + if (use_q_quit) + disable_raw_mode(); + + /* Cleanup */ + close(nl_sd); + if (cfg.container_path) + free(cfg.container_path); + + return 0; +} diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 3feac0482fe9..21cb3c3d1331 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -194,75 +194,108 @@ static int get_family_id(int sd) #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) #define delay_ms(t) (t / 1000000ULL) +/* + * Version compatibility note: + * Field availability depends on taskstats version (t->version), + * corresponding to TASKSTATS_VERSION in kernel headers + * see include/uapi/linux/taskstats.h + * + * Version feature mapping: + * version >= 11 - supports COMPACT statistics + * version >= 13 - supports WPCOPY statistics + * version >= 14 - supports IRQ statistics + * version >= 16 - supports *_max and *_min delay statistics + * + * Always verify version before accessing version-dependent fields + * to maintain backward compatibility. + */ +#define PRINT_CPU_DELAY(version, t) \ + do { \ + if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average", "delay max", "delay min"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ + delay_ms((double)(t)->cpu_delay_max), \ + delay_ms((double)(t)->cpu_delay_min)); \ + } else { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count)); \ + } \ + } while (0) +#define PRINT_FILED_DELAY(name, version, t, count, total, max, min) \ + do { \ + if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min)); \ + } else { \ + printf("%-10s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average"); \ + printf(" %15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count)); \ + } \ + } while (0) + static void print_delayacct(struct taskstats *t) { - printf("\n\nCPU %15s%15s%15s%15s%15s%15s%15s\n" - " 
%15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "IO %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "SWAP %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "RECLAIM %12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "THRASHING%12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "COMPACT %12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "WPCOPY %12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "IRQ %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n", - "count", "real total", "virtual total", - "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->cpu_count, - (unsigned long long)t->cpu_run_real_total, - (unsigned long long)t->cpu_run_virtual_total, - (unsigned long long)t->cpu_delay_total, - average_ms((double)t->cpu_delay_total, t->cpu_count), - delay_ms((double)t->cpu_delay_max), - delay_ms((double)t->cpu_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->blkio_count, - (unsigned long long)t->blkio_delay_total, - average_ms((double)t->blkio_delay_total, t->blkio_count), - delay_ms((double)t->blkio_delay_max), - delay_ms((double)t->blkio_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->swapin_count, - (unsigned long long)t->swapin_delay_total, - average_ms((double)t->swapin_delay_total, t->swapin_count), - delay_ms((double)t->swapin_delay_max), - delay_ms((double)t->swapin_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->freepages_count, - (unsigned long long)t->freepages_delay_total, - average_ms((double)t->freepages_delay_total, t->freepages_count), - delay_ms((double)t->freepages_delay_max), - delay_ms((double)t->freepages_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->thrashing_count, - (unsigned long long)t->thrashing_delay_total, - average_ms((double)t->thrashing_delay_total, t->thrashing_count), - delay_ms((double)t->thrashing_delay_max), - delay_ms((double)t->thrashing_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->compact_count, - (unsigned long long)t->compact_delay_total, - average_ms((double)t->compact_delay_total, t->compact_count), - delay_ms((double)t->compact_delay_max), - delay_ms((double)t->compact_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->wpcopy_count, - (unsigned long long)t->wpcopy_delay_total, - average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), - delay_ms((double)t->wpcopy_delay_max), - delay_ms((double)t->wpcopy_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->irq_count, - (unsigned long long)t->irq_delay_total, - average_ms((double)t->irq_delay_total, t->irq_count), - delay_ms((double)t->irq_delay_max), - delay_ms((double)t->irq_delay_min)); + printf("\n\n"); + + PRINT_CPU_DELAY(t->version, t); + + PRINT_FILED_DELAY("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min); + + PRINT_FILED_DELAY("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min); + + PRINT_FILED_DELAY("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min); + + PRINT_FILED_DELAY("THRASHING", 
t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min); + + if (t->version >= 11) { + PRINT_FILED_DELAY("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min); + } + + if (t->version >= 13) { + PRINT_FILED_DELAY("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min); + } + + if (t->version >= 14) { + PRINT_FILED_DELAY("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min); + } } static void task_context_switch_counts(struct taskstats *t) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 530752ddde8e..c1cfd297aabf 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -229,7 +229,8 @@ #if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) || \ + (defined(__TARGET_ARCH_powerpc)) #define CAN_USE_LOAD_ACQ_STORE_REL #endif diff --git a/tools/testing/selftests/kho/arm64.conf b/tools/testing/selftests/kho/arm64.conf new file mode 100644 index 000000000000..ee696807cd35 --- /dev/null +++ b/tools/testing/selftests/kho/arm64.conf @@ -0,0 +1,9 @@ +QEMU_CMD="qemu-system-aarch64 -M virt -cpu max" +QEMU_KCONFIG=" +CONFIG_SERIAL_AMBA_PL010=y +CONFIG_SERIAL_AMBA_PL010_CONSOLE=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +" +KERNEL_IMAGE="Image" +KERNEL_CMDLINE="console=ttyAMA0" diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c new file mode 100644 index 000000000000..8034e24c6bf6 --- /dev/null +++ b/tools/testing/selftests/kho/init.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef NOLIBC +#include <errno.h> +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <syscall.h> +#include <sys/mount.h> +#include <sys/reboot.h> +#endif + +/* from arch/x86/include/asm/setup.h */ +#define COMMAND_LINE_SIZE 2048 + +/* from include/linux/kexec.h */ +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + +#define KHO_FINILIZE "/debugfs/kho/out/finalize" +#define KERNEL_IMAGE "/kernel" + +static int mount_filesystems(void) +{ + if (mount("debugfs", "/debugfs", "debugfs", 0, NULL) < 0) + return -1; + + return mount("proc", "/proc", "proc", 0, NULL); +} + +static int kho_enable(void) +{ + const char enable[] = "1"; + int fd; + + fd = open(KHO_FINILIZE, O_RDWR); + if (fd < 0) + return -1; + + if (write(fd, enable, sizeof(enable)) != sizeof(enable)) + return 1; + + close(fd); + return 0; +} + +static long kexec_file_load(int kernel_fd, int initrd_fd, + unsigned long cmdline_len, const char *cmdline, + unsigned long flags) +{ + return syscall(__NR_kexec_file_load, kernel_fd, initrd_fd, cmdline_len, + cmdline, flags); +} + +static int kexec_load(void) +{ + char cmdline[COMMAND_LINE_SIZE]; + ssize_t len; + int fd, err; + + fd = open("/proc/cmdline", O_RDONLY); + if (fd < 0) + return -1; + + len = read(fd, cmdline, sizeof(cmdline)); + close(fd); + if (len < 0) + return -1; + + /* replace \n with \0 */ + cmdline[len - 1] = 0; + fd = open(KERNEL_IMAGE, O_RDONLY); + if (fd < 0) + return -1; + + err = kexec_file_load(fd, -1, len, cmdline, KEXEC_FILE_NO_INITRAMFS); + close(fd); + + return err ? : 0; +}
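/*
 * Note on the return statement above: "err ? : 0" uses the GNU C
 * conditional-with-omitted-operand extension; it evaluates to err when
 * err is non-zero and to 0 otherwise, i.e. it is shorthand for:
 *
 *	return err ? err : 0;
 */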
+ +int main(int argc, char *argv[]) +{ + if (mount_filesystems()) + goto err_reboot; + + if (kho_enable()) + goto err_reboot; + + if (kexec_load()) + goto err_reboot; + + if (reboot(RB_KEXEC)) + goto err_reboot; + + return 0; + +err_reboot: + reboot(RB_AUTOBOOT); + return -1; +} diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh new file mode 100755 index 000000000000..ec70a17bd476 --- /dev/null +++ b/tools/testing/selftests/kho/vmtest.sh @@ -0,0 +1,183 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -ue + +CROSS_COMPILE="${CROSS_COMPILE:-""}" + +test_dir=$(realpath "$(dirname "$0")") +kernel_dir=$(realpath "$test_dir/../../../..") + +tmp_dir=$(mktemp -d /tmp/kho-test.XXXXXXXX) +headers_dir="$tmp_dir/usr" +initrd_dir="$tmp_dir/initrd" +initrd="$tmp_dir/initrd.cpio" + +source "$test_dir/../kselftest/ktap_helpers.sh" + +function usage() { + cat <<EOF +$0 [-d build_dir] [-j jobs] [-t target_arch] [-h] Options: + -d) path to the kernel build directory + -j) number of jobs for compilation, similar to -j in make + -t) run test for target_arch, requires CROSS_COMPILE set + supported targets: aarch64, x86_64 + -h) display this help EOF } + +function cleanup() { + rm -fr "$tmp_dir" + ktap_finished +} +trap cleanup EXIT + +function skip() { + local msg=${1:-""} + + ktap_test_skip "$msg" + exit "$KSFT_SKIP" +} + +function fail() { + local msg=${1:-""} + + ktap_test_fail "$msg" + exit "$KSFT_FAIL" +} + +function build_kernel() { + local build_dir=$1 + local make_cmd=$2 + local arch_kconfig=$3 + local kimage=$4 + + local kho_config="$tmp_dir/kho.config" + local kconfig="$build_dir/.config" + + # enable initrd, KHO and KHO test in kernel configuration + tee "$kconfig" > "$kho_config" <<EOF +CONFIG_BLK_DEV_INITRD=y +CONFIG_KEXEC_HANDOVER=y +CONFIG_TEST_KEXEC_HANDOVER=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_VM=y +$arch_kconfig +EOF + + make_cmd="$make_cmd -C $kernel_dir O=$build_dir" + $make_cmd olddefconfig + + # verify that the kernel configuration has all necessary options + while read -r opt ; do + grep "$opt" "$kconfig" &>/dev/null || skip "$opt is missing" + done < "$kho_config" + + $make_cmd "$kimage" + $make_cmd headers_install INSTALL_HDR_PATH="$headers_dir" +} + +function mkinitrd() { + local kernel=$1 + + mkdir -p "$initrd_dir"/{dev,debugfs,proc} + sudo mknod "$initrd_dir/dev/console" c 5 1 + + "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -I"$headers_dir/include" \ + -fno-asynchronous-unwind-tables -fno-ident -nostdlib \ + -include "$test_dir/../../../include/nolibc/nolibc.h" \ + -o "$initrd_dir/init" "$test_dir/init.c" \ + + cp "$kernel" "$initrd_dir/kernel" + + pushd "$initrd_dir" &>/dev/null + find . | cpio -H newc --create > "$initrd" 2>/dev/null + popd &>/dev/null +}
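# For reference, typical invocations of this script (build directory, job
# count and toolchain prefix are illustrative; cross-arch runs require
# CROSS_COMPILE to be set):
#
#	./vmtest.sh
#	./vmtest.sh -d /tmp/kho-build -j 8
#	CROSS_COMPILE=aarch64-linux-gnu- ./vmtest.sh -t aarch64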
+ +function run_qemu() { + local qemu_cmd=$1 + local cmdline=$2 + local kernel=$3 + local serial="$tmp_dir/qemu.serial" + + cmdline="$cmdline kho=on panic=-1" + + $qemu_cmd -m 1G -smp 2 -no-reboot -nographic -nodefaults \ + -accel kvm -accel hvf -accel tcg \ + -serial file:"$serial" \ + -append "$cmdline" \ + -kernel "$kernel" \ + -initrd "$initrd" + + grep "KHO restore succeeded" "$serial" &> /dev/null || fail "KHO failed" +} + +function target_to_arch() { + local target=$1 + + case $target in + aarch64) echo "arm64" ;; + x86_64) echo "x86" ;; + *) skip "architecture $target is not supported" + esac +} + +function main() { + local build_dir="$kernel_dir/.kho" + local jobs=$(($(nproc) * 2)) + local target="$(uname -m)" + + # skip the test if any of the preparation steps fails + set -o errtrace + trap skip ERR + + while getopts 'hd:j:t:' opt; do + case $opt in + d) + build_dir="$OPTARG" + ;; + j) + jobs="$OPTARG" + ;; + t) + target="$OPTARG" + ;; + h) + usage + exit 0 + ;; + *) + echo Unknown argument "$opt" + usage + exit 1 + ;; + esac + done + + ktap_print_header + ktap_set_plan 1 + + if [[ "$target" != "$(uname -m)" ]] && [[ -z "$CROSS_COMPILE" ]]; then + skip "Cross-platform testing requires CROSS_COMPILE to be set" + fi + + mkdir -p "$build_dir" + local arch=$(target_to_arch "$target") + source "$test_dir/$arch.conf" + + # build the kernel and create initrd + # initrd includes the kernel image that will be kexec'ed + local make_cmd="make ARCH=$arch CROSS_COMPILE=$CROSS_COMPILE -j$jobs" + build_kernel "$build_dir" "$make_cmd" "$QEMU_KCONFIG" "$KERNEL_IMAGE" + + local kernel="$build_dir/arch/$arch/boot/$KERNEL_IMAGE" + mkinitrd "$kernel" + + run_qemu "$QEMU_CMD" "$KERNEL_CMDLINE" "$kernel" + + ktap_test_pass "KHO succeeded" +} + +main "$@" diff --git a/tools/testing/selftests/kho/x86.conf b/tools/testing/selftests/kho/x86.conf new file mode 100644 index 000000000000..b419e610ca22 --- /dev/null +++ b/tools/testing/selftests/kho/x86.conf @@ -0,0 +1,7 @@ +QEMU_CMD=qemu-system-x86_64 +QEMU_KCONFIG=" +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +" +KERNEL_IMAGE="bzImage" +KERNEL_CMDLINE="console=ttyS0" diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore index b7dde152e75a..f6be8efd57ea 100644 --- a/tools/testing/selftests/ptrace/.gitignore +++ b/tools/testing/selftests/ptrace/.gitignore @@ -3,3 +3,4 @@ get_syscall_info get_set_sud peeksiginfo vmaccess +set_syscall_info diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c index a40097232967..ba58589a1145 100644 --- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c +++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c @@ -32,12 +32,12 @@ void workload_hint_exit(int signum) fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); if (fd < 0) { - perror("Unable to open workload type feature enable file\n"); + perror("Unable to open workload type feature enable file"); exit(1); } if (write(fd, "0\n", 2) < 0) { - perror("Can't disable workload hints\n"); + perror("Can't disable workload hints"); exit(1); } @@ -68,16 +68,14 @@ int main(int argc, char **argv) exit(1); sprintf(delay_str, "%s\n", argv[1]); - - sprintf(delay_str, "%s\n", argv[1]); fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); if (fd < 0) { - perror("Unable to open workload notification delay\n"); + 
perror("Unable to open workload notification delay"); exit(1); } if (write(fd, delay_str, strlen(delay_str)) < 0) { - perror("Can't set delay\n"); + perror("Can't set delay"); exit(1); } @@ -94,12 +92,12 @@ int main(int argc, char **argv) /* Enable feature via sysfs knob */ fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); if (fd < 0) { - perror("Unable to open workload type feature enable file\n"); + perror("Unable to open workload type feature enable file"); exit(1); } if (write(fd, "1\n", 2) < 0) { - perror("Can't enable workload hints\n"); + perror("Can't enable workload hints"); exit(1); } @@ -110,7 +108,7 @@ int main(int argc, char **argv) while (1) { fd = open(WORKLOAD_TYPE_INDEX_ATTRIBUTE, O_RDONLY); if (fd < 0) { - perror("Unable to open workload type file\n"); + perror("Unable to open workload type file"); exit(1); }
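The perror() changes above drop trailing newlines from the message argument: perror() itself appends a colon, the errno description and a final newline, so an embedded "\n" splits the diagnostic across two lines. A minimal illustration (the path is only an example):

	#include <stdio.h>
	#include <fcntl.h>

	int main(void)
	{
		if (open("/no/such/file", O_RDONLY) < 0) {
			/* prints "open failed" and ": No such file or directory" on two lines */
			perror("open failed\n");
			/* prints "open failed: No such file or directory" on one line */
			perror("open failed");
		}
		return 0;
	}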