-rw-r--r-- .mailmap | 3
-rw-r--r-- Documentation/ABI/testing/sysfs-fs-f2fs | 56
-rw-r--r-- Documentation/accounting/delay-accounting.rst | 91
-rw-r--r-- Documentation/admin-guide/kernel-parameters.txt | 2
-rw-r--r-- Documentation/admin-guide/sysctl/kernel.rst | 2
-rw-r--r-- Documentation/dev-tools/kcov.rst | 7
-rw-r--r-- Documentation/filesystems/f2fs.rst | 122
-rw-r--r-- Documentation/filesystems/fuse/fuse-io-uring.rst (renamed from Documentation/filesystems/fuse-io-uring.rst) | 0
-rw-r--r-- Documentation/filesystems/fuse/fuse-io.rst (renamed from Documentation/filesystems/fuse-io.rst) | 2
-rw-r--r-- Documentation/filesystems/fuse/fuse-passthrough.rst (renamed from Documentation/filesystems/fuse-passthrough.rst) | 0
-rw-r--r-- Documentation/filesystems/fuse/fuse.rst (renamed from Documentation/filesystems/fuse.rst) | 20
-rw-r--r-- Documentation/filesystems/fuse/index.rst | 14
-rw-r--r-- Documentation/filesystems/index.rst | 5
-rw-r--r-- Documentation/filesystems/mount_api.rst | 10
-rw-r--r-- Documentation/filesystems/porting.rst | 12
-rw-r--r-- Documentation/filesystems/sysfs.rst | 2
-rw-r--r-- Documentation/translations/zh_CN/filesystems/sysfs.txt | 2
-rw-r--r-- Documentation/translations/zh_TW/filesystems/sysfs.txt | 2
-rw-r--r-- MAINTAINERS | 5
-rw-r--r-- arch/alpha/include/asm/pgtable.h | 25
-rw-r--r-- arch/alpha/mm/init.c | 27
-rw-r--r-- arch/csky/mm/fault.c | 2
-rw-r--r-- arch/m68k/include/asm/pgtable_mm.h | 10
-rw-r--r-- arch/microblaze/include/asm/pgtable.h | 1
-rw-r--r-- arch/openrisc/include/asm/pgtable.h | 17
-rw-r--r-- arch/x86/kernel/crash.c | 25
-rw-r--r-- arch/x86/kernel/kexec-bzimage64.c | 47
-rw-r--r-- arch/xtensa/include/asm/pgtable.h | 1
-rw-r--r-- drivers/firmware/efi/efi-init.c | 29
-rw-r--r-- drivers/gpu/drm/i915/gem/i915_gemfs.c | 9
-rw-r--r-- drivers/gpu/drm/v3d/v3d_gemfs.c | 9
-rw-r--r-- drivers/video/fbdev/core/fbcon.c | 9
-rw-r--r-- fs/9p/vfs_inode.c | 34
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 15
-rw-r--r-- fs/afs/dir_edit.c | 4
-rw-r--r-- fs/afs/dir_search.c | 2
-rw-r--r-- fs/afs/internal.h | 6
-rw-r--r-- fs/afs/mntpt.c | 3
-rw-r--r-- fs/configfs/dir.c | 5
-rw-r--r-- fs/cramfs/inode.c | 2
-rw-r--r-- fs/dcache.c | 30
-rw-r--r-- fs/ecryptfs/dentry.c | 14
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 27
-rw-r--r-- fs/ecryptfs/file.c | 15
-rw-r--r-- fs/ecryptfs/inode.c | 19
-rw-r--r-- fs/ecryptfs/main.c | 24
-rw-r--r-- fs/exfat/balloc.c | 85
-rw-r--r-- fs/exfat/dir.c | 160
-rw-r--r-- fs/exfat/exfat_fs.h | 7
-rw-r--r-- fs/exfat/exfat_raw.h | 6
-rw-r--r-- fs/exfat/fatent.c | 11
-rw-r--r-- fs/exfat/file.c | 52
-rw-r--r-- fs/exfat/inode.c | 2
-rw-r--r-- fs/exfat/namei.c | 4
-rw-r--r-- fs/exfat/nls.c | 2
-rw-r--r-- fs/exfat/super.c | 68
-rw-r--r-- fs/ext4/Kconfig | 27
-rw-r--r-- fs/ext4/ext4.h | 28
-rw-r--r-- fs/ext4/fast_commit.c | 2
-rw-r--r-- fs/ext4/file.c | 2
-rw-r--r-- fs/ext4/fsmap.c | 14
-rw-r--r-- fs/ext4/indirect.c | 2
-rw-r--r-- fs/ext4/inode.c | 47
-rw-r--r-- fs/ext4/ioctl.c | 312
-rw-r--r-- fs/ext4/mballoc.c | 10
-rw-r--r-- fs/ext4/mmp.c | 6
-rw-r--r-- fs/ext4/move_extent.c | 2
-rw-r--r-- fs/ext4/orphan.c | 19
-rw-r--r-- fs/ext4/super.c | 38
-rw-r--r-- fs/ext4/xattr.c | 21
-rw-r--r-- fs/f2fs/checkpoint.c | 53
-rw-r--r-- fs/f2fs/compress.c | 43
-rw-r--r-- fs/f2fs/data.c | 59
-rw-r--r-- fs/f2fs/dir.c | 17
-rw-r--r-- fs/f2fs/extent_cache.c | 15
-rw-r--r-- fs/f2fs/f2fs.h | 88
-rw-r--r-- fs/f2fs/file.c | 49
-rw-r--r-- fs/f2fs/gc.c | 25
-rw-r--r-- fs/f2fs/node.c | 77
-rw-r--r-- fs/f2fs/node.h | 1
-rw-r--r-- fs/f2fs/recovery.c | 2
-rw-r--r-- fs/f2fs/segment.c | 30
-rw-r--r-- fs/f2fs/segment.h | 28
-rw-r--r-- fs/f2fs/super.c | 121
-rw-r--r-- fs/f2fs/sysfs.c | 119
-rw-r--r-- fs/fat/dir.c | 7
-rw-r--r-- fs/fs_context.c | 17
-rw-r--r-- fs/fuse/Kconfig | 2
-rw-r--r-- fs/fuse/Makefile | 5
-rw-r--r-- fs/fuse/backing.c | 179
-rw-r--r-- fs/fuse/cuse.c | 3
-rw-r--r-- fs/fuse/dev.c | 227
-rw-r--r-- fs/fuse/dev_uring.c | 8
-rw-r--r-- fs/fuse/dir.c | 21
-rw-r--r-- fs/fuse/file.c | 86
-rw-r--r-- fs/fuse/fuse_dev_i.h | 13
-rw-r--r-- fs/fuse/fuse_i.h | 70
-rw-r--r-- fs/fuse/inode.c | 76
-rw-r--r-- fs/fuse/iomode.c | 3
-rw-r--r-- fs/fuse/passthrough.c | 167
-rw-r--r-- fs/fuse/trace.c | 13
-rw-r--r-- fs/fuse/virtio_fs.c | 12
-rw-r--r-- fs/gfs2/inode.c | 26
-rw-r--r-- fs/internal.h | 4
-rw-r--r-- fs/jbd2/checkpoint.c | 2
-rw-r--r-- fs/jfs/inode.c | 8
-rw-r--r-- fs/jfs/jfs_dtree.c | 4
-rw-r--r-- fs/jfs/jfs_logmgr.c | 1
-rw-r--r-- fs/jfs/jfs_mount.c | 10
-rw-r--r-- fs/jfs/jfs_txnmgr.c | 9
-rw-r--r-- fs/lockd/svc.c | 6
-rw-r--r-- fs/mount.h | 39
-rw-r--r-- fs/namespace.c | 1004
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 8
-rw-r--r-- fs/nfs/blocklayout/dev.c | 8
-rw-r--r-- fs/nfs/callback.c | 10
-rw-r--r-- fs/nfs/dir.c | 26
-rw-r--r-- fs/nfs/file.c | 29
-rw-r--r-- fs/nfs/filelayout/filelayout.c | 10
-rw-r--r-- fs/nfs/filelayout/filelayoutdev.c | 10
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 786
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.h | 64
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 115
-rw-r--r-- fs/nfs/fs_context.c | 3
-rw-r--r-- fs/nfs/inode.c | 15
-rw-r--r-- fs/nfs/internal.h | 10
-rw-r--r-- fs/nfs/localio.c | 403
-rw-r--r-- fs/nfs/namespace.c | 3
-rw-r--r-- fs/nfs/nfs2xdr.c | 2
-rw-r--r-- fs/nfs/nfs3xdr.c | 2
-rw-r--r-- fs/nfs/nfs42proc.c | 4
-rw-r--r-- fs/nfs/nfs42xdr.c | 2
-rw-r--r-- fs/nfs/nfs4file.c | 1
-rw-r--r-- fs/nfs/nfs4proc.c | 12
-rw-r--r-- fs/nfs/nfs4state.c | 3
-rw-r--r-- fs/nfs/nfs4super.c | 44
-rw-r--r-- fs/nfs/nfs4xdr.c | 4
-rw-r--r-- fs/nfs/nfstrace.h | 215
-rw-r--r-- fs/nfs/write.c | 34
-rw-r--r-- fs/nfsd/filecache.c | 34
-rw-r--r-- fs/nfsd/filecache.h | 4
-rw-r--r-- fs/nfsd/localio.c | 11
-rw-r--r-- fs/nfsd/nfsctl.c | 139
-rw-r--r-- fs/nfsd/nfssvc.c | 7
-rw-r--r-- fs/nfsd/trace.h | 27
-rw-r--r-- fs/nfsd/vfs.h | 4
-rw-r--r-- fs/notify/fanotify/fanotify.h | 2
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 105
-rw-r--r-- fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r-- fs/ntfs3/bitmap.c | 1
-rw-r--r-- fs/ntfs3/file.c | 28
-rw-r--r-- fs/ntfs3/index.c | 10
-rw-r--r-- fs/ntfs3/inode.c | 1
-rw-r--r-- fs/ntfs3/ntfs_fs.h | 2
-rw-r--r-- fs/ntfs3/run.c | 12
-rw-r--r-- fs/ocfs2/alloc.c | 3
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 11
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r-- fs/ocfs2/inode.c | 8
-rw-r--r-- fs/ocfs2/ioctl.c | 18
-rw-r--r-- fs/ocfs2/move_extents.c | 8
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 2
-rw-r--r-- fs/ocfs2/stack_user.c | 1
-rw-r--r-- fs/ocfs2/sysfile.c | 12
-rw-r--r-- fs/open.c | 10
-rw-r--r-- fs/orangefs/namei.c | 10
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 11
-rw-r--r-- fs/orangefs/orangefs-kernel.h | 2
-rw-r--r-- fs/orangefs/xattr.c | 12
-rw-r--r-- fs/overlayfs/copy_up.c | 2
-rw-r--r-- fs/overlayfs/dir.c | 29
-rw-r--r-- fs/overlayfs/inode.c | 1
-rw-r--r-- fs/overlayfs/namei.c | 17
-rw-r--r-- fs/overlayfs/overlayfs.h | 8
-rw-r--r-- fs/overlayfs/ovl_entry.h | 1
-rw-r--r-- fs/overlayfs/params.c | 15
-rw-r--r-- fs/overlayfs/params.h | 1
-rw-r--r-- fs/overlayfs/readdir.c | 126
-rw-r--r-- fs/overlayfs/super.c | 64
-rw-r--r-- fs/overlayfs/util.c | 6
-rw-r--r-- fs/pnode.c | 75
-rw-r--r-- fs/pnode.h | 1
-rw-r--r-- fs/proc/base.c | 2
-rw-r--r-- fs/quota/dquot.c | 10
-rw-r--r-- fs/smb/client/Kconfig | 1
-rw-r--r-- fs/smb/client/cached_dir.c | 50
-rw-r--r-- fs/smb/client/cached_dir.h | 16
-rw-r--r-- fs/smb/client/cifs_debug.c | 41
-rw-r--r-- fs/smb/client/cifsencrypt.c | 8
-rw-r--r-- fs/smb/client/cifsfs.c | 40
-rw-r--r-- fs/smb/client/dir.c | 54
-rw-r--r-- fs/smb/client/fs_context.c | 11
-rw-r--r-- fs/smb/client/inode.c | 2
-rw-r--r-- fs/smb/client/readdir.c | 40
-rw-r--r-- fs/smb/client/smb2ops.c | 22
-rw-r--r-- fs/smb/client/smb2pdu.c | 18
-rw-r--r-- fs/smb/client/transport.c | 13
-rw-r--r-- fs/smb/common/Makefile | 1
-rw-r--r-- fs/smb/common/arc4.h | 23
-rw-r--r-- fs/smb/common/cifs_arc4.c | 75
-rw-r--r-- fs/smb/server/Kconfig | 1
-rw-r--r-- fs/smb/server/auth.c | 9
-rw-r--r-- fs/smb/server/connection.c | 23
-rw-r--r-- fs/smb/server/connection.h | 6
-rw-r--r-- fs/smb/server/ksmbd_netlink.h | 5
-rw-r--r-- fs/smb/server/mgmt/share_config.c | 2
-rw-r--r-- fs/smb/server/mgmt/user_session.c | 28
-rw-r--r-- fs/smb/server/server.h | 1
-rw-r--r-- fs/smb/server/smb2pdu.c | 7
-rw-r--r-- fs/smb/server/transport_ipc.c | 3
-rw-r--r-- fs/smb/server/transport_rdma.c | 5
-rw-r--r-- fs/smb/server/transport_tcp.c | 98
-rw-r--r-- fs/smb/server/vfs.c | 16
-rw-r--r-- fs/squashfs/file.c | 137
-rw-r--r-- fs/squashfs/inode.c | 39
-rw-r--r-- fs/squashfs/squashfs.h | 1
-rw-r--r-- fs/squashfs/squashfs_fs.h | 1
-rw-r--r-- fs/squashfs/squashfs_fs_i.h | 2
-rw-r--r-- fs/super.c | 3
-rw-r--r-- fs/udf/inode.c | 3
-rw-r--r-- fs/vboxsf/dir.c | 25
-rw-r--r-- include/linux/backing-dev.h | 14
-rw-r--r-- include/linux/dcache.h | 5
-rw-r--r-- include/linux/f2fs_fs.h | 1
-rw-r--r-- include/linux/fs.h | 37
-rw-r--r-- include/linux/fs_context.h | 9
-rw-r--r-- include/linux/fsnotify_backend.h | 2
-rw-r--r-- include/linux/idr.h | 8
-rw-r--r-- include/linux/kernel.h | 21
-rw-r--r-- include/linux/kexec.h | 5
-rw-r--r-- include/linux/kexec_handover.h | 6
-rw-r--r-- include/linux/list.h | 8
-rw-r--r-- include/linux/lsm_hook_defs.h | 2
-rw-r--r-- include/linux/moduleparam.h | 13
-rw-r--r-- include/linux/mount.h | 9
-rw-r--r-- include/linux/nfs_page.h | 2
-rw-r--r-- include/linux/nfs_xdr.h | 4
-rw-r--r-- include/linux/nfslocalio.h | 2
-rw-r--r-- include/linux/nvmem-provider.h | 2
-rw-r--r-- include/linux/pagemap.h | 2
-rw-r--r-- include/linux/panic.h | 6
-rw-r--r-- include/linux/printk.h | 2
-rw-r--r-- include/linux/sched/task.h | 5
-rw-r--r-- include/linux/security.h | 4
-rw-r--r-- include/linux/sunrpc/debug.h | 30
-rw-r--r-- include/linux/sunrpc/svc.h | 4
-rw-r--r-- include/linux/sunrpc/svc_xprt.h | 3
-rw-r--r-- include/linux/sunrpc/xdr.h | 8
-rw-r--r-- include/linux/wait.h | 12
-rw-r--r-- include/trace/misc/fs.h | 22
-rw-r--r-- include/uapi/linux/ext4.h | 53
-rw-r--r-- include/uapi/linux/fuse.h | 22
-rw-r--r-- include/uapi/linux/kexec.h | 4
-rw-r--r-- init/main.c | 12
-rw-r--r-- kernel/Kconfig.kexec | 11
-rw-r--r-- kernel/Makefile | 1
-rw-r--r-- kernel/audit_tree.c | 12
-rw-r--r-- kernel/crash_core.c | 30
-rw-r--r-- kernel/crash_core_test.c | 343
-rw-r--r-- kernel/fork.c | 18
-rw-r--r-- kernel/hung_task.c | 78
-rw-r--r-- kernel/kallsyms_selftest.c | 2
-rw-r--r-- kernel/kcov.c | 9
-rw-r--r-- kernel/kexec_core.c | 1
-rw-r--r-- kernel/kexec_file.c | 1
-rw-r--r-- kernel/kexec_handover.c | 22
-rw-r--r-- kernel/panic.c | 129
-rw-r--r-- kernel/printk/internal.h | 1
-rw-r--r-- kernel/printk/nbcon.c | 14
-rw-r--r-- kernel/printk/printk.c | 37
-rw-r--r-- kernel/printk/printk_ringbuffer.c | 2
-rw-r--r-- kernel/sys.c | 32
-rw-r--r-- kernel/trace/trace.c | 3
-rw-r--r-- kernel/watchdog.c | 28
-rw-r--r-- kernel/watchdog_perf.c | 4
-rw-r--r-- lib/Kconfig.debug | 6
-rw-r--r-- lib/alloc_tag.c | 3
-rw-r--r-- lib/btree.c | 4
-rw-r--r-- lib/decompress.c | 21
-rw-r--r-- lib/digsig.c | 1
-rw-r--r-- lib/dump_stack.c | 2
-rw-r--r-- lib/fault-inject-usercopy.c | 4
-rw-r--r-- lib/genalloc.c | 5
-rw-r--r-- lib/ref_tracker.c | 6
-rw-r--r-- lib/sys_info.c | 3
-rw-r--r-- lib/test_firmware.c | 7
-rw-r--r-- mm/backing-dev.c | 2
-rw-r--r-- mm/filemap.c | 34
-rw-r--r-- mm/page-writeback.c | 45
-rw-r--r-- net/sunrpc/Kconfig | 14
-rw-r--r-- net/sunrpc/auth_gss/gss_rpc_xdr.c | 8
-rw-r--r-- net/sunrpc/sched.c | 2
-rw-r--r-- net/sunrpc/socklib.c | 2
-rw-r--r-- net/sunrpc/svc.c | 11
-rw-r--r-- net/sunrpc/svc_xprt.c | 7
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c | 2
-rwxr-xr-x scripts/checkpatch.pl | 14
-rw-r--r-- scripts/coccinelle/api/platform_no_drv_owner.cocci | 9
-rw-r--r-- scripts/coccinelle/misc/of_table.cocci | 14
-rw-r--r-- security/security.c | 2
-rw-r--r-- security/selinux/hooks.c | 2
-rw-r--r-- security/smack/smack_lsm.c | 2
-rw-r--r-- tools/accounting/delaytop.c | 571
-rw-r--r-- tools/testing/radix-tree/idr-test.c | 16
-rw-r--r-- tools/testing/selftests/Makefile | 1
-rw-r--r-- tools/testing/selftests/filesystems/fuse/.gitignore | 3
-rw-r--r-- tools/testing/selftests/filesystems/fuse/Makefile | 21
-rw-r--r-- tools/testing/selftests/filesystems/fuse/fuse_mnt.c | 146
-rw-r--r-- tools/testing/selftests/filesystems/fuse/fusectl_test.c | 140
-rw-r--r-- tools/testing/selftests/proc/.gitignore | 1
-rw-r--r-- tools/testing/selftests/proc/Makefile | 1
-rw-r--r-- tools/testing/selftests/proc/proc-net-dev-lseek.c | 68
-rw-r--r-- tools/testing/selftests/proc/proc-pid-vm.c | 12
313 files changed, 7148 insertions, 3134 deletions
diff --git a/.mailmap b/.mailmap
index d30f9363a4c9..644b0dcffa39 100644
--- a/.mailmap
+++ b/.mailmap
@@ -721,7 +721,8 @@ Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
Shuah Khan <shuah@kernel.org> <shuahkh@osg.samsung.com>
Shuah Khan <shuah@kernel.org> <shuah.kh@samsung.com>
-Sibi Sankar <quic_sibis@quicinc.com> <sibis@codeaurora.org>
+Sibi Sankar <sibi.sankar@oss.qualcomm.com> <sibis@codeaurora.org>
+Sibi Sankar <sibi.sankar@oss.qualcomm.com> <quic_sibis@quicinc.com>
Sid Manning <quic_sidneym@quicinc.com> <sidneym@codeaurora.org>
Simon Arlott <simon@octiron.net> <simon@fire.lp0.eu>
Simona Vetter <simona.vetter@ffwll.ch> <daniel.vetter@ffwll.ch>
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index bc0e7fefc39d..b590809869ca 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -822,8 +822,8 @@ What: /sys/fs/f2fs/<disk>/gc_valid_thresh_ratio
Date: September 2024
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: It controls the valid block ratio threshold not to trigger excessive GC
- for zoned deivces. The initial value of it is 95(%). F2FS will stop the
- background GC thread from intiating GC for sections having valid blocks
+ for zoned devices. The initial value of it is 95(%). F2FS will stop the
+ background GC thread from initiating GC for sections having valid blocks
exceeding the ratio.
What: /sys/fs/f2fs/<disk>/max_read_extent_count
@@ -847,7 +847,7 @@ Description: For several zoned storage devices, vendors will provide extra space
filesystem level GC. To do that, we can reserve the space using
reserved_blocks. However, it is not enough, since this extra space should
not be shown to users. So, with this new sysfs node, we can hide the space
- by substracting reserved_blocks from total bytes.
+ by subtracting reserved_blocks from total bytes.
What: /sys/fs/f2fs/<disk>/encoding_flags
Date: April 2025
@@ -883,3 +883,53 @@ Date: June 2025
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Control GC algorithm for boost GC. 0: cost benefit, 1: greedy
Default: 1
+
+What: /sys/fs/f2fs/<disk>/effective_lookup_mode
+Date: August 2025
+Contact: "Daniel Lee" <chullee@google.com>
+Description:
+ This is a read-only entry to show the effective directory lookup mode
+ F2FS is currently using for casefolded directories.
+ This considers both the "lookup_mode" mount option and the on-disk
+ encoding flag, SB_ENC_NO_COMPAT_FALLBACK_FL.
+
+ Possible values are:
+ - "perf": Hash-only lookup.
+ - "compat": Hash-based lookup with a linear search fallback enabled
+ - "auto:perf": lookup_mode is auto and fallback is disabled on-disk
+ - "auto:compat": lookup_mode is auto and fallback is enabled on-disk
+
+What: /sys/fs/f2fs/<disk>/bggc_io_aware
+Date: August 2025
+Contact: "Liao Yuanhong" <liaoyuanhong@vivo.com>
+Description:	Used to adjust the BG_GC priority when there is pending IO, with a
+		default value of 0. Specifically, for ZUFS, the default value is 1.
+
+ ================== ======================================================
+ value description
+ bggc_io_aware = 0 skip background GC if there is any kind of pending IO
+ bggc_io_aware = 1 skip background GC if there is pending read IO
+		bggc_io_aware = 2  ignore pending IO for background GC
+ ================== ======================================================
+
+What: /sys/fs/f2fs/<disk>/allocate_section_hint
+Date: August 2025
+Contact: "Liao Yuanhong" <liaoyuanhong@vivo.com>
+Description:	Indicates the hint section between the first device and the others in a
+		multi-device setup. It defaults to the end of the first device, in sections. For
+		a single storage device, it defaults to the total number of sections. It can be
+		manually set to match scenarios where multiple devices are mapped to the same dm device.
+
+What: /sys/fs/f2fs/<disk>/allocate_section_policy
+Date: August 2025
+Contact: "Liao Yuanhong" <liaoyuanhong@vivo.com>
+Description:	Controls write priority in multi-device setups. A value of 0 means normal writing.
+ A value of 1 prioritizes writing to devices before the allocate_section_hint. A value of 2
+ prioritizes writing to devices after the allocate_section_hint. The default is 0.
+
+ =========================== ==========================================================
+ value description
+ allocate_section_policy = 0 Normal writing
+ allocate_section_policy = 1 Prioritize writing to section before allocate_section_hint
+ allocate_section_policy = 2 Prioritize writing to section after allocate_section_hint
+ =========================== ==========================================================
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 8ccc5af5ea1e..86d7902a657f 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -134,47 +134,72 @@ The above command can be used with -v to get more debug information.
After the system starts, use `delaytop` to get the system-wide delay information,
which includes system-wide PSI information and Top-N high-latency tasks.
+Note: PSI support requires `CONFIG_PSI=y` and `psi=1` for full functionality.
-`delaytop` supports sorting by CPU latency in descending order by default,
-displays the top 20 high-latency tasks by default, and refreshes the latency
-data every 2 seconds by default.
+`delaytop` is an interactive tool for monitoring system pressure and task delays.
+It supports multiple sorting options, display modes, and real-time keyboard controls.
-Get PSI information and Top-N tasks delay, since system boot::
+Basic usage with default settings (sorts by CPU delay, shows top 20 tasks, refreshes every 2 seconds)::
bash# ./delaytop
- System Pressure Information: (avg10/avg60/avg300/total)
- CPU some: 0.0%/ 0.0%/ 0.0%/ 345(ms)
+	System Pressure Information: (avg10/avg60/avg300/total)
+ CPU some: 0.0%/ 0.0%/ 0.0%/ 106137(ms)
CPU full: 0.0%/ 0.0%/ 0.0%/ 0(ms)
Memory full: 0.0%/ 0.0%/ 0.0%/ 0(ms)
Memory some: 0.0%/ 0.0%/ 0.0%/ 0(ms)
- IO full: 0.0%/ 0.0%/ 0.0%/ 65(ms)
- IO some: 0.0%/ 0.0%/ 0.0%/ 79(ms)
+ IO full: 0.0%/ 0.0%/ 0.0%/ 2240(ms)
+ IO some: 0.0%/ 0.0%/ 0.0%/ 2783(ms)
IRQ full: 0.0%/ 0.0%/ 0.0%/ 0(ms)
- Top 20 processes (sorted by CPU delay):
- PID TGID COMMAND CPU(ms) IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms)
- ----------------------------------------------------------------------------------------------
- 161 161 zombie_memcg_re 1.40 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 130 130 blkcg_punt_bio 1.37 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 444 444 scsi_tmf_0 0.73 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 1280 1280 rsyslogd 0.53 0.04 0.00 0.00 0.00 0.00 0.00 0.00
- 12 12 ksoftirqd/0 0.47 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 1277 1277 nbd-server 0.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 308 308 kworker/2:2-sys 0.41 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 55 55 netns 0.36 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 1187 1187 acpid 0.31 0.03 0.00 0.00 0.00 0.00 0.00 0.00
- 6184 6184 kworker/1:2-sys 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 186 186 kaluad 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 18 18 ksoftirqd/1 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 185 185 kmpath_rdacd 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 190 190 kstrp 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 2759 2759 agetty 0.20 0.03 0.00 0.00 0.00 0.00 0.00 0.00
- 1190 1190 kworker/0:3-sys 0.19 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 1272 1272 sshd 0.15 0.04 0.00 0.00 0.00 0.00 0.00 0.00
- 1156 1156 license 0.15 0.11 0.00 0.00 0.00 0.00 0.00 0.00
- 134 134 md 0.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- 6142 6142 kworker/3:2-xfs 0.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00
-
-Dynamic interactive interface of delaytop::
+ [o]sort [M]memverbose [q]quit
+ Top 20 processes (sorted by cpu delay):
+ PID TGID COMMAND CPU(ms) IO(ms) IRQ(ms) MEM(ms)
+ ------------------------------------------------------------------------
+ 110 110 kworker/15:0H-s 27.91 0.00 0.00 0.00
+ 57 57 cpuhp/7 3.18 0.00 0.00 0.00
+ 99 99 cpuhp/14 2.97 0.00 0.00 0.00
+ 51 51 cpuhp/6 0.90 0.00 0.00 0.00
+ 44 44 kworker/4:0H-sy 0.80 0.00 0.00 0.00
+ 60 60 ksoftirqd/7 0.74 0.00 0.00 0.00
+ 76 76 idle_inject/10 0.31 0.00 0.00 0.00
+ 100 100 idle_inject/14 0.30 0.00 0.00 0.00
+ 1309 1309 systemsettings 0.29 0.00 0.00 0.00
+ 45 45 cpuhp/5 0.22 0.00 0.00 0.00
+ 63 63 cpuhp/8 0.20 0.00 0.00 0.00
+ 87 87 cpuhp/12 0.18 0.00 0.00 0.00
+ 93 93 cpuhp/13 0.17 0.00 0.00 0.00
+ 1265 1265 acpid 0.17 0.00 0.00 0.00
+ 1552 1552 sshd 0.17 0.00 0.00 0.00
+ 2584 2584 sddm-helper 0.16 0.00 0.00 0.00
+ 1284 1284 rtkit-daemon 0.15 0.00 0.00 0.00
+ 1326 1326 nde-netfilter 0.14 0.00 0.00 0.00
+ 27 27 cpuhp/2 0.13 0.00 0.00 0.00
+ 631 631 kworker/11:2-rc 0.11 0.00 0.00 0.00
+
+Interactive keyboard controls during runtime::
+
+ o - Select sort field (CPU, IO, IRQ, Memory, etc.)
+ M - Toggle display mode (Default/Memory Verbose)
+ q - Quit
+
+Available sort fields (use -s/--sort or the interactive command)::
+
+ cpu(c) - CPU delay
+ blkio(i) - I/O delay
+ irq(q) - IRQ delay
+ mem(m) - Total memory delay
+ swapin(s) - Swapin delay (memory verbose mode only)
+ freepages(r) - Freepages reclaim delay (memory verbose mode only)
+ thrashing(t) - Thrashing delay (memory verbose mode only)
+ compact(p) - Compaction delay (memory verbose mode only)
+ wpcopy(w) - Write page copy delay (memory verbose mode only)
+
+Advanced usage examples::
+
+ # ./delaytop -s blkio
+ Sorted by IO delay
+
+ # ./delaytop -s mem -M
+ Sorted by memory delay in memory verbose mode
# ./delaytop -p pid
Print delayacct stats
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e019db1633fd..74ca438d2d6d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4603,7 +4603,7 @@
bit 2: print timer info
bit 3: print locks info if CONFIG_LOCKDEP is on
bit 4: print ftrace buffer
- bit 5: replay all messages on consoles at the end of panic
+ bit 5: replay all kernel messages on consoles at the end of panic
bit 6: print all CPUs backtrace (if available in the arch)
bit 7: print only tasks in uninterruptible (blocked) state
*Be aware* that this option may print a _lot_ of lines,
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 8b49eab937d0..f3ee807b5d8b 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -890,7 +890,7 @@ bit 1 print system memory info
bit 2 print timer info
bit 3 print locks info if ``CONFIG_LOCKDEP`` is on
bit 4 print ftrace buffer
-bit 5 replay all messages on consoles at the end of panic
+bit 5 replay all kernel messages on consoles at the end of panic
bit 6 print all CPUs backtrace (if available in the arch)
bit 7 print only tasks in uninterruptible (blocked) state
===== ============================================
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index 6611434e2dd2..8127849d40f5 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -361,7 +361,12 @@ local tasks spawned by the process and the global task that handles USB bus #1:
*/
sleep(2);
- n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+ /*
+	 * The load of the coverage count should be an acquire to pair with
+	 * the corresponding write memory barrier (smp_wmb()) on
+ * the kernel-side in kcov_move_area().
+ */
+ n = __atomic_load_n(&cover[0], __ATOMIC_ACQUIRE);
for (i = 0; i < n; i++)
printf("0x%lx\n", cover[i + 1]);
if (ioctl(fd, KCOV_DISABLE, 0))
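The barrier pairing this hunk documents is the classic publish/consume pattern: the
producer writes the payload, issues a write barrier, then publishes the count; the
consumer reads the count with acquire semantics before touching the payload. A minimal
sketch of that pattern (simplified for illustration, not the actual kcov_move_area()
code)::

	/* Producer (kernel side, simplified): */
	for (i = 0; i < n; i++)
		area[i + 1] = pc[i];		/* payload: collected PCs */
	smp_wmb();				/* order payload before count */
	WRITE_ONCE(area[0], n);			/* publish the count */

	/* Consumer (user side): */
	n = __atomic_load_n(&area[0], __ATOMIC_ACQUIRE); /* pairs with smp_wmb() */
	for (i = 0; i < n; i++)
		printf("0x%lx\n", area[i + 1]);	/* payload is now visible */

Without the acquire, the consumer could observe the new count but stale payload entries.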
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index e5bb89452aff..a8d02fe5be83 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -1,8 +1,11 @@
.. SPDX-License-Identifier: GPL-2.0
-==========================================
-WHAT IS Flash-Friendly File System (F2FS)?
-==========================================
+=================================
+Flash-Friendly File System (F2FS)
+=================================
+
+Overview
+========
NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
been equipped on a variety of systems ranging from mobile to server systems. Since
@@ -173,9 +176,12 @@ data_flush Enable data flushing before checkpoint in order to
persist data of regular and symlink.
reserve_root=%d Support configuring reserved space which is used for
allocation from a privileged user with specified uid or
- gid, unit: 4KB, the default limit is 0.2% of user blocks.
-resuid=%d The user ID which may use the reserved blocks.
-resgid=%d The group ID which may use the reserved blocks.
+ gid, unit: 4KB, the default limit is 12.5% of user blocks.
+reserve_node=%d Support configuring reserved nodes which are used for
+ allocation from a privileged user with specified uid or
+ gid, the default limit is 12.5% of all nodes.
+resuid=%d The user ID which may use the reserved blocks and nodes.
+resgid=%d The group ID which may use the reserved blocks and nodes.
fault_injection=%d Enable fault injection in all supported types with
specified injection rate.
fault_type=%d Support configuring fault injection type, should be
@@ -291,9 +297,13 @@ compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo"
"lz4", "zstd" and "lzo-rle" algorithm.
compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
"lz4" and "zstd" support compress level config.
+
+ ========= ===========
algorithm level range
+ ========= ===========
lz4 3 - 16
zstd 1 - 22
+ ========= ===========
compress_log_size=%u Support configuring compress cluster size. The size will
be 4KB * (1 << %u). The default and minimum sizes are 16KB.
compress_extension=%s Support adding specified extension, so that f2fs can enable
@@ -357,6 +367,7 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes:
panic immediately, continue without doing anything, and remount
the partition in read-only mode. By default it uses "continue"
mode.
+
====================== =============== =============== ========
mode continue remount-ro panic
====================== =============== =============== ========
@@ -370,6 +381,25 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes:
====================== =============== =============== ========
nat_bits Enable nat_bits feature to enhance full/empty nat blocks access,
by default it's disabled.
+lookup_mode=%s Control the directory lookup behavior for casefolded
+ directories. This option has no effect on directories
+ that do not have the casefold feature enabled.
+
+ ================== ========================================
+ Value Description
+ ================== ========================================
+ perf (Default) Enforces a hash-only lookup.
+ The linear search fallback is always
+ disabled, ignoring the on-disk flag.
+ compat Enables the linear search fallback for
+ compatibility with directory entries
created by older kernels that used a
+ different case-folding algorithm.
+ This mode ignores the on-disk flag.
+ auto F2FS determines the mode based on the
+ on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL`
+ flag.
+ ================== ========================================
======================== ============================================================
Debugfs Entries
@@ -795,11 +825,13 @@ ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
extension list " "
-- buffered io
+------------------------------------------------------------------
N/A COLD_DATA WRITE_LIFE_EXTREME
N/A HOT_DATA WRITE_LIFE_SHORT
N/A WARM_DATA WRITE_LIFE_NOT_SET
-- direct io
+------------------------------------------------------------------
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
@@ -915,24 +947,26 @@ compression enabled files (refer to "Compression implementation" section for how
enable compression on a regular inode).
1) compress_mode=fs
-This is the default option. f2fs does automatic compression in the writeback of the
-compression enabled files.
+
+ This is the default option. f2fs does automatic compression in the writeback of the
+ compression enabled files.
2) compress_mode=user
-This disables the automatic compression and gives the user discretion of choosing the
-target file and the timing. The user can do manual compression/decompression on the
-compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
-ioctls like the below.
-To decompress a file,
+ This disables the automatic compression and gives the user discretion of choosing the
+ target file and the timing. The user can do manual compression/decompression on the
+ compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
+ ioctls like the below.
+
+To decompress a file::
-fd = open(filename, O_WRONLY, 0);
-ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);
+ fd = open(filename, O_WRONLY, 0);
+ ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);
-To compress a file,
+To compress a file::
-fd = open(filename, O_WRONLY, 0);
-ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
+ fd = open(filename, O_WRONLY, 0);
+ ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
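Putting the two snippets together: a self-contained sketch of a manual
compress/decompress helper, assuming a kernel built with F2FS compression, a file that
already has the compression flag, and compress_mode=user as described above; the ioctl
numbers come from <linux/f2fs.h>::

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/f2fs.h>	/* F2FS_IOC_COMPRESS_FILE, F2FS_IOC_DECOMPRESS_FILE */

	int main(int argc, char **argv)
	{
		int fd, ret = 0;
		unsigned long cmd;

		if (argc != 3) {
			fprintf(stderr, "usage: %s compress|decompress <file>\n", argv[0]);
			return 1;
		}
		cmd = strcmp(argv[1], "compress") == 0 ?
			F2FS_IOC_COMPRESS_FILE : F2FS_IOC_DECOMPRESS_FILE;

		fd = open(argv[2], O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, cmd) < 0) {
			perror("ioctl");
			ret = 1;
		}
		close(fd);
		return ret;
	}

Note the O_WRONLY open, matching the snippets above; both ioctls operate on the whole file.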
NVMe Zoned Namespace devices
----------------------------
@@ -962,32 +996,32 @@ reserved and used by another filesystem or for different purposes. Once that
external usage is complete, the device aliasing file can be deleted, releasing
the reserved space back to F2FS for its own use.
-<use-case>
-
-# ls /dev/vd*
-/dev/vdb (32GB) /dev/vdc (32GB)
-# mkfs.ext4 /dev/vdc
-# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb
-# mount /dev/vdb /mnt/f2fs
-# ls -l /mnt/f2fs
-vdc.file
-# df -h
-/dev/vdb 64G 33G 32G 52% /mnt/f2fs
-
-# mount -o loop /dev/vdc /mnt/ext4
-# df -h
-/dev/vdb 64G 33G 32G 52% /mnt/f2fs
-/dev/loop7 32G 24K 30G 1% /mnt/ext4
-# umount /mnt/ext4
-
-# f2fs_io getflags /mnt/f2fs/vdc.file
-get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable
-# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file
-get a flag on noimmutable ret=0, flags=800010
-set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable
-# rm /mnt/f2fs/vdc.file
-# df -h
-/dev/vdb 64G 753M 64G 2% /mnt/f2fs
+.. code-block::
+
+ # ls /dev/vd*
+ /dev/vdb (32GB) /dev/vdc (32GB)
+ # mkfs.ext4 /dev/vdc
+ # mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb
+ # mount /dev/vdb /mnt/f2fs
+ # ls -l /mnt/f2fs
+ vdc.file
+ # df -h
+ /dev/vdb 64G 33G 32G 52% /mnt/f2fs
+
+ # mount -o loop /dev/vdc /mnt/ext4
+ # df -h
+ /dev/vdb 64G 33G 32G 52% /mnt/f2fs
+ /dev/loop7 32G 24K 30G 1% /mnt/ext4
+ # umount /mnt/ext4
+
+ # f2fs_io getflags /mnt/f2fs/vdc.file
+ get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable
+ # f2fs_io setflags noimmutable /mnt/f2fs/vdc.file
+ get a flag on noimmutable ret=0, flags=800010
+ set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable
+ # rm /mnt/f2fs/vdc.file
+ # df -h
+ /dev/vdb 64G 753M 64G 2% /mnt/f2fs
So, the key idea is that the user can do any file operations on /dev/vdc, and
reclaim the space after use, while the space is counted as /data.
diff --git a/Documentation/filesystems/fuse-io-uring.rst b/Documentation/filesystems/fuse/fuse-io-uring.rst
index d73dd0dbd238..d73dd0dbd238 100644
--- a/Documentation/filesystems/fuse-io-uring.rst
+++ b/Documentation/filesystems/fuse/fuse-io-uring.rst
diff --git a/Documentation/filesystems/fuse-io.rst b/Documentation/filesystems/fuse/fuse-io.rst
index 6464de4266ad..d736ac4cb483 100644
--- a/Documentation/filesystems/fuse-io.rst
+++ b/Documentation/filesystems/fuse/fuse-io.rst
@@ -1,7 +1,7 @@
.. SPDX-License-Identifier: GPL-2.0
==============
-Fuse I/O Modes
+FUSE I/O Modes
==============
Fuse supports the following I/O modes:
diff --git a/Documentation/filesystems/fuse-passthrough.rst b/Documentation/filesystems/fuse/fuse-passthrough.rst
index 2b0e7c2da54a..2b0e7c2da54a 100644
--- a/Documentation/filesystems/fuse-passthrough.rst
+++ b/Documentation/filesystems/fuse/fuse-passthrough.rst
diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse/fuse.rst
index 1e31e87aee68..0fbd5a03fdc9 100644
--- a/Documentation/filesystems/fuse.rst
+++ b/Documentation/filesystems/fuse/fuse.rst
@@ -1,8 +1,8 @@
.. SPDX-License-Identifier: GPL-2.0
-====
-FUSE
-====
+=============
+FUSE Overview
+=============
Definitions
===========
@@ -129,6 +129,20 @@ For each connection the following files exist within this directory:
connection. This means that all waiting requests will be aborted and an
error returned for all aborted and new requests.
+ max_background
+ The maximum number of background requests that can be outstanding
+ at a time. When the number of background requests reaches this limit,
+ further requests will be blocked until some are completed, potentially
+ causing I/O operations to stall.
+
+ congestion_threshold
+ The threshold of background requests at which the kernel considers
+ the filesystem to be congested. When the number of background requests
+ exceeds this value, the kernel will skip asynchronous readahead
+ operations, reducing read-ahead optimizations but preserving essential
+ I/O, as well as suspending non-synchronous writeback operations
+ (WB_SYNC_NONE), delaying page cache flushing to the filesystem.
+
Only the owner of the mount may read or write these files.
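Since both knobs are plain text files under the fusectl mount, tuning is just a write by
the mount owner. A hedged sketch (the connection id 42 is hypothetical; the real id is
the directory name under /sys/fs/fuse/connections)::

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical connection id 42 */
		FILE *f = fopen("/sys/fs/fuse/connections/42/max_background", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "%d\n", 32);	/* raise the background request limit */
		return fclose(f) ? 1 : 0;
	}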
Interrupting filesystem operations
diff --git a/Documentation/filesystems/fuse/index.rst b/Documentation/filesystems/fuse/index.rst
new file mode 100644
index 000000000000..393a845214da
--- /dev/null
+++ b/Documentation/filesystems/fuse/index.rst
@@ -0,0 +1,14 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================================
+FUSE (Filesystem in Userspace) Technical Documentation
+======================================================
+
+.. toctree::
+ :maxdepth: 2
+ :numbered:
+
+ fuse
+ fuse-io
+ fuse-io-uring
+ fuse-passthrough
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 622187a96bdc..af516e528ded 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -95,10 +95,7 @@ Documentation for filesystem implementations.
hfs
hfsplus
hpfs
- fuse
- fuse-io
- fuse-io-uring
- fuse-passthrough
+ fuse/index
inotify
isofs
nilfs2
diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst
index e149b89118c8..c99ab1f7fea4 100644
--- a/Documentation/filesystems/mount_api.rst
+++ b/Documentation/filesystems/mount_api.rst
@@ -506,8 +506,16 @@ returned.
* ::
+ int vfs_parse_fs_qstr(struct fs_context *fc, const char *key,
+ const struct qstr *value);
+
+ A wrapper around vfs_parse_fs_param() that copies the value string it is
+ passed.
+
+ * ::
+
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
- const char *value, size_t v_size);
+ const char *value);
A wrapper around vfs_parse_fs_param() that copies the value string it is
passed.
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 78c3d07c0c08..7233b04668fc 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1297,3 +1297,15 @@ Several functions are renamed:
- user_path_create -> start_creating_user_path
- user_path_locked_at -> start_removing_user_path_at
- done_path_create -> end_creating_path
+
+---
+
+**mandatory**
+
+Calling conventions for vfs_parse_fs_string() have changed; it does *not*
+take length anymore (value ? strlen(value) : 0 is used). If you want
+a different length, use
+
+ vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len))
+
+instead.
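
In practice the conversion is mechanical. A sketch of the three forms, modelled on the
callers updated elsewhere in this series (afs/mntpt.c and the gemfs helpers)::

	/* Before: explicit length on every call */
	ret = vfs_parse_fs_string(fc, "source", content, size - 1);

	/* After: NUL-terminated values drop the length argument ... */
	ret = vfs_parse_fs_string(fc, "source", "tmpfs");

	/* ... and counted buffers switch to the qstr variant */
	ret = vfs_parse_fs_qstr(fc, "source", &QSTR_LEN(content, size - 1));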
diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst
index c32993bc83c7..be966f27f284 100644
--- a/Documentation/filesystems/sysfs.rst
+++ b/Documentation/filesystems/sysfs.rst
@@ -321,7 +321,7 @@ span multiple bus types).
fs/ contains a directory for some filesystems. Currently each
filesystem wanting to export attributes must create its own hierarchy
-below fs/ (see ./fuse.rst for an example).
+below fs/ (see fuse/fuse.rst for an example).
module/ contains parameter values and state information for all
loaded system modules, for both builtin and loadable modules.
diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt
index 547062759e60..b17c9f638628 100644
--- a/Documentation/translations/zh_CN/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt
@@ -282,7 +282,7 @@ drivers/ contains a directory for each driver bound to devices on this particular bus
 (assuming the driver does not span multiple bus types).
 fs/ contains a directory set up for filesystems. Currently, each filesystem wanting to export attributes must
-create its own hierarchy below fs/ (see Documentation/filesystems/fuse.rst).
+create its own hierarchy below fs/ (see Documentation/filesystems/fuse/fuse.rst).
 dev/ contains two subdirectories: char/ and block/. In these two subdirectories there are symlinks
 named in <major>:<minor> format. These symlinks point to the sysfs directory
diff --git a/Documentation/translations/zh_TW/filesystems/sysfs.txt b/Documentation/translations/zh_TW/filesystems/sysfs.txt
index 978462d5fe14..d1cee02ef1de 100644
--- a/Documentation/translations/zh_TW/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_TW/filesystems/sysfs.txt
@@ -285,7 +285,7 @@ drivers/ contains a directory for each driver bound to devices on this particular bus
 (assuming the driver does not span multiple bus types).
 fs/ contains a directory set up for filesystems. Currently, each filesystem wanting to export attributes must
-create its own hierarchy below fs/ (see Documentation/filesystems/fuse.rst).
+create its own hierarchy below fs/ (see Documentation/filesystems/fuse/fuse.rst).
 dev/ contains two subdirectories: char/ and block/. In these two subdirectories there are symlinks
 named in <major>:<minor> format. These symlinks point to the sysfs directory
diff --git a/MAINTAINERS b/MAINTAINERS
index 6a6cf856c98b..fa3e51d84399 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10208,9 +10208,10 @@ L: linux-fsdevel@vger.kernel.org
S: Maintained
W: https://github.com/libfuse/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git
-F: Documentation/filesystems/fuse*
+F: Documentation/filesystems/fuse/*
F: fs/fuse/
F: include/uapi/linux/fuse.h
+F: tools/testing/selftests/filesystems/fuse/
FUTEX SUBSYSTEM
M: Thomas Gleixner <tglx@linutronix.de>
@@ -20987,7 +20988,7 @@ F: Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml
F: drivers/pmdomain/qcom/cpr.c
QUALCOMM CPUCP MAILBOX DRIVER
-M: Sibi Sankar <quic_sibis@quicinc.com>
+M: Sibi Sankar <sibi.sankar@oss.qualcomm.com>
L: linux-arm-msm@vger.kernel.org
S: Supported
F: Documentation/devicetree/bindings/mailbox/qcom,cpucp-mbox.yaml
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 44e7aedac6e8..90e7a9539102 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -107,7 +107,7 @@ struct vm_area_struct;
#define _PAGE_NORMAL(x) __pgprot(_PAGE_VALID | __ACCESS_BITS | (x))
-#define _PAGE_P(x) _PAGE_NORMAL((x) | (((x) & _PAGE_FOW)?0:_PAGE_FOW))
+#define _PAGE_P(x) _PAGE_NORMAL((x) | _PAGE_FOW)
#define _PAGE_S(x) _PAGE_NORMAL(x)
/*
@@ -126,34 +126,11 @@ struct vm_area_struct;
#define pgprot_noncached(prot) (prot)
/*
- * BAD_PAGETABLE is used when we need a bogus page-table, while
- * BAD_PAGE is used for a bogus page.
- *
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
-extern pte_t __bad_page(void);
-extern pmd_t * __bad_pagetable(void);
-
-extern unsigned long __zero_page(void);
-
-#define BAD_PAGETABLE __bad_pagetable()
-#define BAD_PAGE __bad_page()
#define ZERO_PAGE(vaddr) (virt_to_page(ZERO_PGE))
-/* number of bits that fit into a memory pointer */
-#define BITS_PER_PTR (8*sizeof(unsigned long))
-
-/* to align the pointer to a pointer address */
-#define PTR_MASK (~(sizeof(void*)-1))
-
-/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */
-#define SIZEOF_PTR_LOG2 3
-
-/* to find an entry in a page-table */
-#define PAGE_PTR(address) \
- ((unsigned long)(address)>>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK)
-
/*
* On certain platforms whose physical address space can overlap KSEG,
* namely EV6 and above, we must re-twiddle the physaddr to restore the
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 2d491b8cdab9..4c5ab9cd8a0a 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -60,33 +60,6 @@ pgd_alloc(struct mm_struct *mm)
}
-/*
- * BAD_PAGE is the page that is used for page faults when linux
- * is out-of-memory. Older versions of linux just did a
- * do_exit(), but using this instead means there is less risk
- * for a process dying in kernel mode, possibly leaving an inode
- * unused etc..
- *
- * BAD_PAGETABLE is the accompanying page-table: it is initialized
- * to point to BAD_PAGE entries.
- *
- * ZERO_PAGE is a special page that is used for zero-initialized
- * data and COW.
- */
-pmd_t *
-__bad_pagetable(void)
-{
- memset(absolute_pointer(EMPTY_PGT), 0, PAGE_SIZE);
- return (pmd_t *) EMPTY_PGT;
-}
-
-pte_t
-__bad_page(void)
-{
- memset(absolute_pointer(EMPTY_PGE), 0, PAGE_SIZE);
- return pte_mkdirty(mk_pte(virt_to_page(EMPTY_PGE), PAGE_SHARED));
-}
-
static inline unsigned long
load_PCB(struct pcb_struct *pcb)
{
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index a885518ce1dd..a6ca7dff4215 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -277,7 +277,7 @@ retry:
if (fault & VM_FAULT_COMPLETED)
return;
- if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
/*
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index 62f2ff4e6799..bba64a9c49ac 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -119,16 +119,6 @@ extern void *empty_zero_page;
*/
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-/* number of bits that fit into a memory pointer */
-#define BITS_PER_PTR (8*sizeof(unsigned long))
-
-/* to align the pointer to a pointer address */
-#define PTR_MASK (~(sizeof(void*)-1))
-
-/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */
-/* 64-bit machines, beware! SRB. */
-#define SIZEOF_PTR_LOG2 2
-
extern void kernel_set_cachemode(void *addr, unsigned long size, int cmode);
/*
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index a60e8d895102..4eb76de6be4a 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -99,7 +99,6 @@ extern pte_t *va_to_pte(unsigned long address);
#define PTRS_PER_PGD (1 << (32 - PGDIR_SHIFT))
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_PGD_NR 0
#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index d33702831505..b218050e2f6d 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -183,23 +183,6 @@ extern void paging_init(void);
extern unsigned long empty_zero_page[2048];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-/* number of bits that fit into a memory pointer */
-#define BITS_PER_PTR (8*sizeof(unsigned long))
-
-/* to align the pointer to a pointer address */
-#define PTR_MASK (~(sizeof(void *)-1))
-
-/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */
-/* 64-bit machines, beware! SRB. */
-#define SIZEOF_PTR_LOG2 2
-
-/* to find an entry in a page-table */
-#define PAGE_PTR(address) \
-((unsigned long)(address)>>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK)
-
-/* to set the page-dir */
-#define SET_PAGE_DIR(tsk, pgdir)
-
#define pte_none(x) (!pte_val(x))
#define pte_present(x) (pte_val(x) & _PAGE_PRESENT)
#define pte_clear(mm, addr, xp) do { pte_val(*(xp)) = 0; } while (0)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c6b12bed173d..335fd2ee9766 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -165,14 +165,23 @@ static struct crash_mem *fill_up_crash_elf_data(void)
/*
* Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
* may cause range splits. So add extra slots here.
+ *
+	 * Exclusion of the low 1M may not cause another range split, because
+	 * the excluded range is [0, 1M] and a region is only split when both
+	 * the start and end parameters fall strictly inside an existing
+	 * region in cmem, i.e. neither may equal that region's start or end.
+	 * Obviously, the start of [0, 1M] cannot meet this condition.
+	 *
+	 * But in case the excluded low range changes in the future (e.g.
+	 * [start, 1M]), add an extra slot.
*/
- nr_ranges += 2 + crashk_cma_cnt;
+ nr_ranges += 3 + crashk_cma_cnt;
cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return NULL;
cmem->max_nr_ranges = nr_ranges;
- cmem->nr_ranges = 0;
return cmem;
}
@@ -323,16 +332,20 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
struct crash_mem *cmem;
/*
- * Using random kexec_buf for passing dm crypt keys may cause a range
- * split. So use two slots here.
+	 * In the current x86 architecture code, the elfheader is always
+	 * allocated at crashk_res.start, but whether a split occurs depends
+	 * on the allocation position of the elfheader within crashk_res. To
+	 * avoid potential out-of-bounds access in the future, add an extra
+	 * slot.
+	 *
+	 * And using a random kexec_buf for passing dm crypt keys may cause
+	 * a range split too, so add another extra slot here.
*/
- nr_ranges = 2;
+ nr_ranges = 3;
cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return -ENOMEM;
cmem->max_nr_ranges = nr_ranges;
- cmem->nr_ranges = 0;
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
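The slot arithmetic in both hunks leans on how excluding a range from a region can play
out. A simplified model of the invariant (not the kernel's actual
crash_exclude_mem_range(); the no-overlap case is assumed to be filtered out
beforehand)::

	/*
	 * Outcome of excluding [s, e] from a single region [rs, re]:
	 * returns how many regions replace the original one.
	 */
	static int exclude_outcome(unsigned long rs, unsigned long re,
				   unsigned long s, unsigned long e)
	{
		if (s <= rs && e >= re)
			return 0;	/* fully covered: region removed */
		if (s > rs && e < re)
			return 2;	/* hole in the middle: region splits */
		return 1;		/* trimmed at one end: no extra slot */
	}

Only the return-2 case consumes an extra slot, and it requires both s and e to fall
strictly inside the region; with s == 0 that can never happen, which is exactly the
argument the new comment makes for the low 1M.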
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 24a41f0e0cf1..c3244ac680d1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -16,6 +16,8 @@
#include <linux/kexec.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/libfdt.h>
+#include <linux/of_fdt.h>
#include <linux/efi.h>
#include <linux/random.h>
@@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
}
#endif /* CONFIG_EFI */
+#ifdef CONFIG_OF_FLATTREE
+static void setup_dtb(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int dtb_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + dtb_setup_data_offset;
+ unsigned long setup_data_phys, dtb_len;
+
+ dtb_len = fdt_totalsize(initial_boot_params);
+ sd->type = SETUP_DTB;
+ sd->len = dtb_len;
+
+ /* Carry over current boot DTB with setup_data */
+ memcpy(sd->data, initial_boot_params, dtb_len);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + dtb_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
+#endif /* CONFIG_OF_FLATTREE */
+
static void
setup_ima_state(const struct kimage *image, struct boot_params *params,
unsigned long params_load_addr,
@@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
sizeof(struct efi_setup_data);
#endif
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params) {
+ setup_dtb(params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+ } else {
+ pr_debug("Not carrying over DTB, force_dtb = %d\n",
+ image->force_dtb);
+ }
+#endif
+
if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
/* Setup IMA log buffer state */
setup_ima_state(image, params, params_load_addr,
@@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
sizeof(struct setup_data) +
RNG_SEED_LENGTH;
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params)
+ kbuf.bufsz += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+#endif
+
if (IS_ENABLED(CONFIG_IMA_KEXEC))
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct ima_setup_data);
@@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct kho_data);
- params = kzalloc(kbuf.bufsz, GFP_KERNEL);
+ params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
efi_map_offset = params_cmdline_sz;
@@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ldata;
out_free_params:
- kfree(params);
+ kvfree(params);
return ERR_PTR(ret);
}
@@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data)
if (!ldata)
return 0;
- kfree(ldata->bootparams_buf);
+ kvfree(ldata->bootparams_buf);
ldata->bootparams_buf = NULL;
return 0;
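
The kzalloc()-to-kvzalloc() switch pairs with the kvfree() calls above: once the
boot_params buffer can carry a whole FDT in setup_data, it may grow beyond what
kmalloc() serves comfortably, and kvzalloc() transparently falls back to vmalloc for
large sizes. The rule that matters when reading these hunks::

	/*
	 * kvzalloc() memory may be kmalloc- or vmalloc-backed, so it must be
	 * released with kvfree(); plain kfree() on a vmalloc'd buffer is a bug.
	 */
	params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
	/* ... use params ... */
	kvfree(params);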
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index d6eb695f2b26..50a136213b2b 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -58,7 +58,6 @@
#define PTRS_PER_PTE_SHIFT 10
#define PTRS_PER_PGD 1024
#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT)
#ifdef CONFIG_MMU
/*
diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c
index a00e07b853f2..a65c2d5b9e7b 100644
--- a/drivers/firmware/efi/efi-init.c
+++ b/drivers/firmware/efi/efi-init.c
@@ -12,6 +12,7 @@
#include <linux/efi.h>
#include <linux/fwnode.h>
#include <linux/init.h>
+#include <linux/kexec_handover.h>
#include <linux/memblock.h>
#include <linux/mm_types.h>
#include <linux/of.h>
@@ -164,12 +165,32 @@ static __init void reserve_regions(void)
pr_info("Processing EFI memory map:\n");
/*
- * Discard memblocks discovered so far: if there are any at this
- * point, they originate from memory nodes in the DT, and UEFI
- * uses its own memory map instead.
+ * Discard memblocks discovered so far except for KHO scratch
+ * regions. Most memblocks at this point originate from memory nodes
+ * in the DT and UEFI uses its own memory map instead. However, if
+	 * KHO is enabled, scratch regions, which are known good memory,
+	 * must be preserved.
*/
memblock_dump_all();
- memblock_remove(0, PHYS_ADDR_MAX);
+
+ if (is_kho_boot()) {
+ struct memblock_region *r;
+
+ /* Remove all non-KHO regions */
+ for_each_mem_region(r) {
+ if (!memblock_is_kho_scratch(r)) {
+ memblock_remove(r->base, r->size);
+ r--;
+ }
+ }
+ } else {
+ /*
+ * KHO is disabled. Discard memblocks discovered so far:
+ * if there are any at this point, they originate from memory
+ * nodes in the DT, and UEFI uses its own memory map instead.
+ */
+ memblock_remove(0, PHYS_ADDR_MAX);
+ }
for_each_efi_memory_desc(md) {
paddr = md->phys_addr;
diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c b/drivers/gpu/drm/i915/gem/i915_gemfs.c
index a09e2eb47175..8f13ec4ff0d0 100644
--- a/drivers/gpu/drm/i915/gem/i915_gemfs.c
+++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c
@@ -11,11 +11,6 @@
#include "i915_gemfs.h"
#include "i915_utils.h"
-static int add_param(struct fs_context *fc, const char *key, const char *val)
-{
- return vfs_parse_fs_string(fc, key, val, strlen(val));
-}
-
void i915_gemfs_init(struct drm_i915_private *i915)
{
struct file_system_type *type;
@@ -48,9 +43,9 @@ void i915_gemfs_init(struct drm_i915_private *i915)
fc = fs_context_for_mount(type, SB_KERNMOUNT);
if (IS_ERR(fc))
goto err;
- ret = add_param(fc, "source", "tmpfs");
+ ret = vfs_parse_fs_string(fc, "source", "tmpfs");
if (!ret)
- ret = add_param(fc, "huge", "within_size");
+ ret = vfs_parse_fs_string(fc, "huge", "within_size");
if (!ret)
gemfs = fc_mount_longterm(fc);
put_fs_context(fc);
diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c
index 8ec6ed82b3d9..c1a30166c099 100644
--- a/drivers/gpu/drm/v3d/v3d_gemfs.c
+++ b/drivers/gpu/drm/v3d/v3d_gemfs.c
@@ -7,11 +7,6 @@
#include "v3d_drv.h"
-static int add_param(struct fs_context *fc, const char *key, const char *val)
-{
- return vfs_parse_fs_string(fc, key, val, strlen(val));
-}
-
void v3d_gemfs_init(struct v3d_dev *v3d)
{
struct file_system_type *type;
@@ -38,9 +33,9 @@ void v3d_gemfs_init(struct v3d_dev *v3d)
fc = fs_context_for_mount(type, SB_KERNMOUNT);
if (IS_ERR(fc))
goto err;
- ret = add_param(fc, "source", "tmpfs");
+ ret = vfs_parse_fs_string(fc, "source", "tmpfs");
if (!ret)
- ret = add_param(fc, "huge", "within_size");
+ ret = vfs_parse_fs_string(fc, "huge", "within_size");
if (!ret)
gemfs = fc_mount_longterm(fc);
put_fs_context(fc);
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 5940e2eb9231..96cc9b389246 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -279,14 +279,7 @@ static int fbcon_get_rotate(struct fb_info *info)
static bool fbcon_skip_panic(struct fb_info *info)
{
-/* panic_cpu is not exported, and can't be used if built as module. Use
- * oops_in_progress instead, but non-fatal oops won't be printed.
- */
-#if defined(MODULE)
- return (info->skip_panic && unlikely(oops_in_progress));
-#else
- return (info->skip_panic && unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID));
-#endif
+ return (info->skip_panic && unlikely(panic_in_progress()));
}
static inline bool fbcon_is_active(struct vc_data *vc, struct fb_info *info)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 399d455d50d6..d0c77ec31b1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -768,22 +768,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct v9fs_inode __maybe_unused *v9inode;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
- struct dentry *res = NULL;
struct inode *inode;
int p9_omode;
if (d_in_lookup(dentry)) {
- res = v9fs_vfs_lookup(dir, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode);
@@ -795,17 +791,17 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
"write-only file with writeback enabled, creating w/ O_RDWR\n");
}
fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- goto error;
- }
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
inode = d_inode(dentry);
v9inode = V9FS_I(inode);
err = finish_open(file, dentry, generic_file_open);
- if (err)
- goto error;
+ if (unlikely(err)) {
+ p9_fid_put(fid);
+ return err;
+ }
file->private_data = fid;
#ifdef CONFIG_9P_FSCACHE
@@ -818,13 +814,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
-out:
- dput(res);
- return err;
-
-error:
- p9_fid_put(fid);
- goto out;
+ return 0;
}
/**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5b5fda617b80..be297e335468 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -238,20 +238,16 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct p9_fid *dfid = NULL, *ofid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
- struct dentry *res = NULL;
if (d_in_lookup(dentry)) {
- res = v9fs_vfs_lookup(dir, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
v9ses = v9fs_inode2v9ses(dir);
@@ -337,7 +333,6 @@ out:
p9_fid_put(ofid);
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- dput(res);
return err;
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 4b1342c72089..fd3aa9f97ce6 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -239,7 +239,7 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta,
* The caller must hold the inode locked.
*/
void afs_edit_dir_add(struct afs_vnode *vnode,
- struct qstr *name, struct afs_fid *new_fid,
+ const struct qstr *name, struct afs_fid *new_fid,
enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block;
@@ -391,7 +391,7 @@ error:
* The caller must hold the inode locked.
*/
void afs_edit_dir_remove(struct afs_vnode *vnode,
- struct qstr *name, enum afs_edit_dir_reason why)
+ const struct qstr *name, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block, *pblock;
union afs_xdr_dirent *de, *pde;
diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c
index b25bd892db4d..d2516e55b5ed 100644
--- a/fs/afs/dir_search.c
+++ b/fs/afs/dir_search.c
@@ -188,7 +188,7 @@ bad:
/*
* Search the appropriate hash chain in the contents of an AFS directory.
*/
-int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version)
{
struct afs_dir_iter iter = { .dvnode = dvnode, };
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 444a3ea4fdf6..a45ae5c2ef8a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1099,9 +1099,9 @@ int afs_single_writepages(struct address_space *mapping,
/*
* dir_edit.c
*/
-extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
+extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
-extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
+extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason);
void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
@@ -1114,7 +1114,7 @@ bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
struct afs_fid *_fid);
-int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version);
/*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 9434a5399f2b..1ad048e6e164 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -137,7 +137,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ret = -EINVAL;
if (content[size - 1] == '.')
- ret = vfs_parse_fs_string(fc, "source", content, size - 1);
+ ret = vfs_parse_fs_qstr(fc, "source",
+ &QSTR_LEN(content, size - 1));
do_delayed_call(&cleanup);
if (ret < 0)
return ret;
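
vfs_parse_fs_qstr() takes a length-delimited name, so the trailing '.' is
trimmed without writing a NUL into the buffer. Assuming QSTR_LEN() is the
usual struct qstr initializer, the call is roughly equivalent to:

	/* Assumption: QSTR_LEN(name, len) builds a struct qstr by value. */
	struct qstr source = { .name = content, .len = size - 1 };

	ret = vfs_parse_fs_qstr(fc, "source", &source);
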
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index f327fbb9a0ca..81f4f06bc87e 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1601,10 +1601,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
err = -ENOENT;
if (configfs_dirent_is_ready(parent_sd)) {
file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL);
- if (IS_ERR(file->private_data))
- err = PTR_ERR(file->private_data);
- else
- err = 0;
+ err = PTR_ERR_OR_ZERO(file->private_data);
}
inode_unlock(d_inode(dentry));
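
PTR_ERR_OR_ZERO() from <linux/err.h> collapses the removed if/else into one
expression (paraphrased from the header):

	static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
	{
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);
		return 0;
	}
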
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 12daa85ed941..ca54bf24b719 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -421,7 +421,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
vmf = vmf_insert_mixed(vma, vma->vm_start + off,
- address + off);
+ PHYS_PFN(address + off));
if (vmf & VM_FAULT_ERROR)
ret = vm_fault_to_errno(vmf, 0);
}
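
vmf_insert_mixed() expects a page-frame number, but the old code passed a raw
physical address; PHYS_PFN() is the address-to-PFN shift. A standalone model
of the arithmetic (PAGE_SHIFT value assumed):

	#include <assert.h>
	#include <stdint.h>

	#define PAGE_SHIFT	12		/* assumed: 4 KiB pages */
	#define PHYS_PFN(x)	((uintptr_t)((x) >> PAGE_SHIFT))

	int main(void)
	{
		uint64_t address = 0x40000000ULL;	/* hypothetical flash base */
		uint64_t off = 5 * 4096;

		/* A PFN is the physical address divided by the page size. */
		assert(PHYS_PFN(address + off) == (address + off) / 4096);
		return 0;
	}
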
diff --git a/fs/dcache.c b/fs/dcache.c
index 65cc11939654..a067fa0a965a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1390,6 +1390,7 @@ struct check_mount {
unsigned int mounted;
};
+/* locks: mount_locked_reader && dentry->d_lock */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
struct check_mount *info = data;
@@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent)
{
struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
d_walk(parent->dentry, &data, path_check_mount);
- read_sequnlock_excl(&mount_lock);
return data.mounted;
}
@@ -1717,13 +1717,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_shortname.string;
}
- dentry->d_name.len = name->len;
- dentry->d_name.hash = name->hash;
+ dentry->__d_name.len = name->len;
+ dentry->__d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
/* Make sure we always see the terminating NUL character */
- smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
+ smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */
dentry->d_flags = 0;
lockref_init(&dentry->d_lockref);
@@ -2743,15 +2743,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
/*
* Both external: swap the pointers
*/
- swap(target->d_name.name, dentry->d_name.name);
+ swap(target->__d_name.name, dentry->__d_name.name);
} else {
/*
* dentry:internal, target:external. Steal target's
* storage and make target internal.
*/
- dentry->d_name.name = target->d_name.name;
+ dentry->__d_name.name = target->__d_name.name;
target->d_shortname = dentry->d_shortname;
- target->d_name.name = target->d_shortname.string;
+ target->__d_name.name = target->d_shortname.string;
}
} else {
if (unlikely(dname_external(dentry))) {
@@ -2759,9 +2759,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
*/
- target->d_name.name = dentry->d_name.name;
+ target->__d_name.name = dentry->__d_name.name;
dentry->d_shortname = target->d_shortname;
- dentry->d_name.name = dentry->d_shortname.string;
+ dentry->__d_name.name = dentry->d_shortname.string;
} else {
/*
* Both are internal.
@@ -2771,7 +2771,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
target->d_shortname.words[i]);
}
}
- swap(dentry->d_name.hash_len, target->d_name.hash_len);
+ swap(dentry->__d_name.hash_len, target->__d_name.hash_len);
}
static void copy_name(struct dentry *dentry, struct dentry *target)
@@ -2781,11 +2781,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
old_name = external_name(dentry);
if (unlikely(dname_external(target))) {
atomic_inc(&external_name(target)->count);
- dentry->d_name = target->d_name;
+ dentry->__d_name = target->__d_name;
} else {
dentry->d_shortname = target->d_shortname;
- dentry->d_name.name = dentry->d_shortname.string;
- dentry->d_name.hash_len = target->d_name.hash_len;
+ dentry->__d_name.name = dentry->d_shortname.string;
+ dentry->__d_name.hash_len = target->__d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->count)))
kfree_rcu(old_name, head);
@@ -3134,7 +3134,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode)
!d_unlinked(dentry));
spin_lock(&dentry->d_parent->d_lock);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
+ dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
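
path_has_submounts() now takes the mount lock through a <linux/cleanup.h>
guard, so it is dropped automatically on every return path. Only the use is
visible in this hunk; the class is presumably declared along these lines (an
assumption):

	/* Assumed declaration, via <linux/cleanup.h>: */
	DEFINE_LOCK_GUARD_0(mount_locked_reader,
			    read_seqlock_excl(&mount_lock),
			    read_sequnlock_excl(&mount_lock))

	/* guard(mount_locked_reader)(); then unlocks at end of scope. */
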
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 1dfd5b81d831..6648a924e31a 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name,
return rc;
}
-struct kmem_cache *ecryptfs_dentry_info_cache;
-
-static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(ecryptfs_dentry_info_cache,
- container_of(head, struct ecryptfs_dentry_info, rcu));
-}
-
/**
* ecryptfs_d_release
* @dentry: The ecryptfs dentry
@@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
*/
static void ecryptfs_d_release(struct dentry *dentry)
{
- struct ecryptfs_dentry_info *p = dentry->d_fsdata;
- if (p) {
- path_put(&p->lower_path);
- call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
- }
+ dput(dentry->d_fsdata);
}
const struct dentry_operations ecryptfs_dops = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 1f562e75d0e4..9e6ab0b41337 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -258,13 +258,6 @@ struct ecryptfs_inode_info {
struct ecryptfs_crypt_stat crypt_stat;
};
-/* dentry private data. Each dentry must keep track of a lower
- * vfsmount too. */
-struct ecryptfs_dentry_info {
- struct path lower_path;
- struct rcu_head rcu;
-};
-
/**
* ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint
* @flags: Status flags
@@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat {
/* superblock private data. */
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
+ struct vfsmount *lower_mnt;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
};
@@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb,
}
static inline void
-ecryptfs_set_dentry_private(struct dentry *dentry,
- struct ecryptfs_dentry_info *dentry_info)
+ecryptfs_set_dentry_lower(struct dentry *dentry,
+ struct dentry *lower_dentry)
{
- dentry->d_fsdata = dentry_info;
+ dentry->d_fsdata = lower_dentry;
}
static inline struct dentry *
ecryptfs_dentry_to_lower(struct dentry *dentry)
{
- return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
+ return dentry->d_fsdata;
}
-static inline const struct path *
-ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+static inline struct path
+ecryptfs_lower_path(struct dentry *dentry)
{
- return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+ return (struct path){
+ .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt,
+ .dentry = ecryptfs_dentry_to_lower(dentry)
+ };
}
#define ecryptfs_printk(type, fmt, arg...) \
@@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users;
extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
extern struct kmem_cache *ecryptfs_file_info_cache;
-extern struct kmem_cache *ecryptfs_dentry_info_cache;
extern struct kmem_cache *ecryptfs_inode_info_cache;
extern struct kmem_cache *ecryptfs_sb_info_cache;
extern struct kmem_cache *ecryptfs_header_cache;
@@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename(
size_t *encoded_name_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size);
-struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
void ecryptfs_dump_hex(char *data, int bytes);
int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 5f8f96da09fe..7929411837cf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(file->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos,
size_t len, unsigned int flags)
{
ssize_t rc;
- const struct path *path;
rc = filemap_splice_read(in, ppos, pipe, len, flags);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(in->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(in->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
* ecryptfs_lookup() */
struct ecryptfs_file_info *file_info;
struct file *lower_file;
+ struct path path;
/* Released in ecryptfs_release or end of function if failure */
file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
@@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
"Error attempting to allocate memory\n");
return -ENOMEM;
}
- lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
- file->f_flags, current_cred());
+ path = ecryptfs_lower_path(ecryptfs_dentry);
+ lower_file = dentry_open(&path, file->f_flags, current_cred());
if (IS_ERR(lower_file)) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index abd954c6a14e..ed1394da8d6b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent);
struct inode *inode, *lower_inode;
- struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
- dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!dentry_info) {
- dput(lower_dentry);
- return ERR_PTR(-ENOMEM);
- }
-
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(path->dentry));
+ d_inode(lower_parent));
BUG_ON(!d_count(lower_dentry));
- ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = mntget(path->mnt);
- dentry_info->lower_path.dentry = lower_dentry;
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
/*
* negative dentry can go positive under us here - its parent is not
@@ -1021,10 +1012,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
{
struct dentry *dentry = path->dentry;
struct kstat lower_stat;
+ struct path lower_path = ecryptfs_lower_path(dentry);
int rc;
- rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index eab1beb846d3..16ea14dd2c62 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ struct path path = ecryptfs_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
- cred);
+ rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", path->dentry, path->mnt, rc);
+ "rc = [%d]\n", path.dentry, path.mnt, rc);
(*lower_file) = NULL;
}
return rc;
@@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc)
struct ecryptfs_fs_context *ctx = fc->fs_private;
struct ecryptfs_sb_info *sbi = fc->s_fs_info;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct ecryptfs_dentry_info *root_info;
const char *err = "Getting sb failed";
struct inode *inode;
struct path path;
@@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out_free;
}
- rc = -ENOMEM;
- root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!root_info)
- goto out_free;
-
- /* ->kill_sb() will take care of root_info */
- ecryptfs_set_dentry_private(s->s_root, root_info);
- root_info->lower_path = path;
+ ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+ ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt;
s->s_flags |= SB_ACTIVE;
fc->root = dget(s->s_root);
@@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
kill_anon_super(sb);
if (!sb_info)
return;
+ mntput(sb_info->lower_mnt);
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
@@ -668,11 +661,6 @@ static struct ecryptfs_cache_info {
.size = sizeof(struct ecryptfs_file_info),
},
{
- .cache = &ecryptfs_dentry_info_cache,
- .name = "ecryptfs_dentry_info_cache",
- .size = sizeof(struct ecryptfs_dentry_info),
- },
- {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
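
With ecryptfs_dentry_info gone, each dentry pins only its lower dentry
(dropped with dput() in ->d_release) while the superblock pins the one lower
vfsmount (dropped with mntput() in ->kill_sb). Callers rebuild the struct
path by value and borrow those references, as the file.c and inode.c hunks
above do:

	/* Usage pattern from this series: no path_get()/path_put() needed,
	 * the dentry and the superblock already hold the references. */
	struct path lower = ecryptfs_lower_path(dentry);

	touch_atime(&lower);
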
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index cc01556c9d9b..2d2d510f2372 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -26,13 +27,58 @@
/*
* Allocation Bitmap Management Functions
*/
+static bool exfat_test_bitmap_range(struct super_block *sb, unsigned int clu,
+ unsigned int count)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int start = clu;
+ unsigned int end = clu + count;
+ unsigned int ent_idx, i, b;
+ unsigned int bit_offset, bits_to_check;
+ __le_long *bitmap_le;
+ unsigned long mask, word;
+
+ if (!is_valid_cluster(sbi, start) || !is_valid_cluster(sbi, end - 1))
+ return false;
+
+ while (start < end) {
+ ent_idx = CLUSTER_TO_BITMAP_ENT(start);
+ i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+
+ bitmap_le = (__le_long *)sbi->vol_amap[i]->b_data;
+
+ /* Calculate how many bits we can check in the current word */
+ bit_offset = b % BITS_PER_LONG;
+ bits_to_check = min(end - start,
+ (unsigned int)(BITS_PER_LONG - bit_offset));
+
+ /* Create a bitmask for the range of bits to check */
+ if (bits_to_check >= BITS_PER_LONG)
+ mask = ~0UL;
+ else
+ mask = ((1UL << bits_to_check) - 1) << bit_offset;
+ word = lel_to_cpu(bitmap_le[b / BITS_PER_LONG]);
+
+ /* Check if all bits in the mask are set */
+ if ((word & mask) != mask)
+ return false;
+
+ start += bits_to_check;
+ }
+
+ return true;
+}
+
static int exfat_allocate_bitmap(struct super_block *sb,
struct exfat_dentry *ep)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct blk_plug plug;
long long map_size;
- unsigned int i, need_map_size;
+ unsigned int i, j, need_map_size;
sector_t sector;
+ unsigned int max_ra_count;
sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu);
map_size = le64_to_cpu(ep->dentry.bitmap.size);
@@ -56,22 +102,37 @@ static int exfat_allocate_bitmap(struct super_block *sb,
return -ENOMEM;
sector = exfat_cluster_to_sector(sbi, sbi->map_clu);
+ max_ra_count = min(sb->s_bdi->ra_pages, sb->s_bdi->io_pages) <<
+ (PAGE_SHIFT - sb->s_blocksize_bits);
for (i = 0; i < sbi->map_sectors; i++) {
- sbi->vol_amap[i] = sb_bread(sb, sector + i);
- if (!sbi->vol_amap[i]) {
- /* release all buffers and free vol_amap */
- int j = 0;
-
- while (j < i)
- brelse(sbi->vol_amap[j++]);
-
- kvfree(sbi->vol_amap);
- sbi->vol_amap = NULL;
- return -EIO;
+ /* Trigger the next readahead in advance. */
+		if ((i % max_ra_count) == 0) {
+ blk_start_plug(&plug);
+ for (j = i; j < min(max_ra_count, sbi->map_sectors - i) + i; j++)
+ sb_breadahead(sb, sector + j);
+ blk_finish_plug(&plug);
}
+
+ sbi->vol_amap[i] = sb_bread(sb, sector + i);
+ if (!sbi->vol_amap[i])
+ goto err_out;
}
+	if (!exfat_test_bitmap_range(sb, sbi->map_clu,
+			EXFAT_B_TO_CLU_ROUND_UP(map_size, sbi)))
+ goto err_out;
+
return 0;
+
+err_out:
+	/* release all buffers and free vol_amap */
+	j = 0;
+ while (j < i)
+ brelse(sbi->vol_amap[j++]);
+
+ kvfree(sbi->vol_amap);
+ sbi->vol_amap = NULL;
+ return -EIO;
}
int exfat_load_bitmap(struct super_block *sb)
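
exfat_test_bitmap_range() checks that the clusters backing the bitmap are
themselves marked allocated, masking up to a word per iteration. A standalone
model of the masking loop (endianness and sector mapping left out):

	#include <assert.h>
	#include <limits.h>

	#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

	/* Are bits [start, end) all set in map[]? */
	static int all_bits_set(const unsigned long *map,
				unsigned int start, unsigned int end)
	{
		while (start < end) {
			unsigned int off = start % BITS_PER_LONG;
			unsigned int n = end - start < BITS_PER_LONG - off ?
					 end - start : BITS_PER_LONG - off;
			unsigned long mask = n >= BITS_PER_LONG ?
					     ~0UL : ((1UL << n) - 1) << off;

			if ((map[start / BITS_PER_LONG] & mask) != mask)
				return 0;
			start += n;
		}
		return 1;
	}

	int main(void)
	{
		unsigned long map[2] = { ~0UL, 0x1UL };

		assert(all_bits_set(map, 0, BITS_PER_LONG + 1));
		assert(!all_bits_set(map, 0, BITS_PER_LONG + 2));
		return 0;
	}
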
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index ee060e26f51d..7229146fe2bf 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -1244,3 +1244,163 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
return count;
}
+
+static int exfat_get_volume_label_dentry(struct super_block *sb,
+ struct exfat_entry_set_cache *es)
+{
+ int i;
+ int dentry = 0;
+ unsigned int type;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_hint_femp hint_femp;
+ struct exfat_inode_info *ei = EXFAT_I(sb->s_root->d_inode);
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ hint_femp.eidx = EXFAT_HINT_NONE;
+ exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < sbi->dentries_per_clu; i++, dentry++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
+ if (!ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ep);
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ if (type == TYPE_DELETED || type == TYPE_UNUSED) {
+ hint_femp.cur = clu;
+ hint_femp.eidx = dentry;
+ hint_femp.count = 1;
+ }
+ }
+
+ if (type == TYPE_UNUSED) {
+ brelse(bh);
+ goto not_found;
+ }
+
+ if (type != TYPE_VOLUME) {
+ brelse(bh);
+ continue;
+ }
+
+ memset(es, 0, sizeof(*es));
+ es->sb = sb;
+ es->bh = es->__bh;
+ es->bh[0] = bh;
+ es->num_bh = 1;
+ es->start_off = EXFAT_DEN_TO_B(i) % sb->s_blocksize;
+
+ return 0;
+ }
+
+		if (exfat_get_next_cluster(sb, &clu.dir))
+ return -EIO;
+ }
+
+not_found:
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ hint_femp.cur.dir = EXFAT_EOF_CLUSTER;
+ hint_femp.eidx = dentry;
+ hint_femp.count = 0;
+ }
+
+ ei->hint_femp = hint_femp;
+
+ return -ENOENT;
+}
+
+int exfat_read_volume_label(struct super_block *sb, struct exfat_uni_name *label_out)
+{
+ int ret, i;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_entry_set_cache es;
+ struct exfat_dentry *ep;
+
+ mutex_lock(&sbi->s_lock);
+
+ memset(label_out, 0, sizeof(*label_out));
+ ret = exfat_get_volume_label_dentry(sb, &es);
+ if (ret < 0) {
+ /*
+		 * ENOENT signifies that a volume label dentry doesn't exist.
+		 * We treat this as an empty volume label rather than failing.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry_cached(&es, 0);
+ label_out->name_len = ep->dentry.volume_label.char_count;
+ if (label_out->name_len > EXFAT_VOLUME_LABEL_LEN) {
+ ret = -EIO;
+ exfat_put_dentry_set(&es, false);
+ goto unlock;
+ }
+
+ for (i = 0; i < label_out->name_len; i++)
+ label_out->name[i] = le16_to_cpu(ep->dentry.volume_label.volume_label[i]);
+
+ exfat_put_dentry_set(&es, false);
+unlock:
+ mutex_unlock(&sbi->s_lock);
+ return ret;
+}
+
+int exfat_write_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label)
+{
+ int ret, i;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct inode *root_inode = sb->s_root->d_inode;
+ struct exfat_entry_set_cache es;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+
+ if (label->name_len > EXFAT_VOLUME_LABEL_LEN)
+ return -EINVAL;
+
+ mutex_lock(&sbi->s_lock);
+
+ ret = exfat_get_volume_label_dentry(sb, &es);
+ if (ret == -ENOENT) {
+ if (label->name_len == 0) {
+ /* No volume label dentry, no need to clear */
+ ret = 0;
+ goto unlock;
+ }
+
+ ret = exfat_find_empty_entry(root_inode, &clu, 1, &es);
+ }
+
+ if (ret < 0)
+ goto unlock;
+
+ ep = exfat_get_dentry_cached(&es, 0);
+
+ if (label->name_len == 0 && ep->dentry.volume_label.char_count == 0) {
+		/* volume label has already been cleared */
+ exfat_put_dentry_set(&es, 0);
+ goto unlock;
+ }
+
+ memset(ep, 0, sizeof(*ep));
+ ep->type = EXFAT_VOLUME;
+
+ for (i = 0; i < label->name_len; i++)
+ ep->dentry.volume_label.volume_label[i] =
+ cpu_to_le16(label->name[i]);
+
+ ep->dentry.volume_label.char_count = label->name_len;
+ es.modified = true;
+
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(root_inode));
+
+unlock:
+ mutex_unlock(&sbi->s_lock);
+ return ret;
+}
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f8ead4d47ef0..329697c89d09 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -477,6 +477,9 @@ int exfat_force_shutdown(struct super_block *sb, u32 flags);
/* namei.c */
extern const struct dentry_operations exfat_dentry_ops;
extern const struct dentry_operations exfat_utf8_dentry_ops;
+int exfat_find_empty_entry(struct inode *inode,
+ struct exfat_chain *p_dir, int num_entries,
+ struct exfat_entry_set_cache *es);
/* cache.c */
int exfat_cache_init(void);
@@ -517,6 +520,10 @@ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
unsigned int num_entries);
int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
+int exfat_read_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label_out);
+int exfat_write_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label);
/* inode.c */
extern const struct inode_operations exfat_file_inode_operations;
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 971a1ccd0e89..4082fa7b8c14 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -80,6 +80,7 @@
#define BOOTSEC_OLDBPB_LEN 53
#define EXFAT_FILE_NAME_LEN 15
+#define EXFAT_VOLUME_LABEL_LEN 11
#define EXFAT_MIN_SECT_SIZE_BITS 9
#define EXFAT_MAX_SECT_SIZE_BITS 12
@@ -160,6 +161,11 @@ struct exfat_dentry {
__le64 size;
} __packed upcase; /* up-case table directory entry */
struct {
+ __u8 char_count;
+ __le16 volume_label[EXFAT_VOLUME_LABEL_LEN];
+ __u8 reserved[8];
+ } __packed volume_label; /* volume label directory entry */
+ struct {
__u8 flags;
__u8 vendor_guid[16];
__u8 vendor_defined[14];
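
The new volume_label member mirrors the on-disk TYPE_VOLUME entry: a type
byte, a character count, eleven UTF-16LE code units and eight reserved bytes,
32 bytes in all like every exFAT directory entry. A quick standalone layout
check:

	#include <assert.h>
	#include <stdint.h>

	/* Model of the 32-byte exFAT volume-label directory entry. */
	struct volume_label_entry {
		uint8_t  type;		/* entry type */
		uint8_t  char_count;	/* 0..11 */
		uint16_t label[11];	/* UTF-16LE code units */
		uint8_t  reserved[8];
	};

	int main(void)
	{
		assert(sizeof(struct volume_label_entry) == 32);
		return 0;
	}
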
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 232cc7f8ab92..825083634ba2 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -89,35 +89,36 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc,
int err;
if (!is_valid_cluster(sbi, loc)) {
- exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)",
+ exfat_fs_error_ratelimit(sb,
+ "invalid access to FAT (entry 0x%08x)",
loc);
return -EIO;
}
err = __exfat_ent_get(sb, loc, content);
if (err) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"failed to access to FAT (entry 0x%08x, err:%d)",
loc, err);
return err;
}
if (*content == EXFAT_FREE_CLUSTER) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT free cluster (entry 0x%08x)",
loc);
return -EIO;
}
if (*content == EXFAT_BAD_CLUSTER) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT bad cluster (entry 0x%08x)",
loc);
return -EIO;
}
if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT (entry 0x%08x) bogus content (0x%08x)",
loc, *content);
return -EIO;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 538d2b6ac2ec..f246cf439588 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -486,6 +486,54 @@ static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg)
return exfat_force_shutdown(sb, flags);
}
+static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long arg)
+{
+ int ret;
+ char label[FSLABEL_MAX] = {0};
+ struct exfat_uni_name uniname;
+
+ ret = exfat_read_volume_label(sb, &uniname);
+ if (ret < 0)
+ return ret;
+
+ ret = exfat_utf16_to_nls(sb, &uniname, label, uniname.name_len);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((char __user *)arg, label, ret + 1))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int exfat_ioctl_set_volume_label(struct super_block *sb,
+ unsigned long arg)
+{
+ int ret = 0, lossy;
+ char label[FSLABEL_MAX];
+ struct exfat_uni_name uniname;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, (char __user *)arg, FSLABEL_MAX))
+ return -EFAULT;
+
+ memset(&uniname, 0, sizeof(uniname));
+ if (label[0]) {
+ ret = exfat_nls_to_utf16(sb, label, FSLABEL_MAX,
+ &uniname, &lossy);
+ if (ret < 0)
+ return ret;
+ else if (lossy & NLS_NAME_LOSSY)
+ return -EINVAL;
+ }
+
+ uniname.name_len = ret;
+
+ return exfat_write_volume_label(sb, &uniname);
+}
+
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -500,6 +548,10 @@ long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return exfat_ioctl_shutdown(inode->i_sb, arg);
case FITRIM:
return exfat_ioctl_fitrim(inode, arg);
+ case FS_IOC_GETFSLABEL:
+ return exfat_ioctl_get_volume_label(inode->i_sb, arg);
+ case FS_IOC_SETFSLABEL:
+ return exfat_ioctl_set_volume_label(inode->i_sb, arg);
default:
return -ENOTTY;
}
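
FS_IOC_GETFSLABEL and FS_IOC_SETFSLABEL are the generic VFS label ioctls from
<linux/fs.h>, operating on a FSLABEL_MAX-byte buffer; the handlers above wire
exfat into them. A minimal userspace exercise:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>		/* FS_IOC_GETFSLABEL, FSLABEL_MAX */

	int main(int argc, char **argv)
	{
		char label[FSLABEL_MAX] = "";
		int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

		if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
			perror("FS_IOC_GETFSLABEL");
			return 1;
		}
		printf("label: \"%s\"\n", label);

		/* Setting needs CAP_SYS_ADMIN; exfat also rejects labels
		 * longer than 11 UTF-16 code units, per the code above. */
		strncpy(label, "NEWLABEL", sizeof(label) - 1);
		if (ioctl(fd, FS_IOC_SETFSLABEL, label) < 0)
			perror("FS_IOC_SETFSLABEL");
		close(fd);
		return 0;
	}
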
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index c10844e1e16c..f9501c3a3666 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -25,7 +25,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+ bool is_dir = (ei->type == TYPE_DIR);
struct timespec64 ts;
if (inode->i_ino == EXFAT_ROOT_INO)
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index f5f1c4e8a29f..7eb9c67fd35f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -300,7 +300,7 @@ static int exfat_check_max_dentries(struct inode *inode)
 * the directory entry index in p_dir is returned on success
* -error code is returned on failure
*/
-static int exfat_find_empty_entry(struct inode *inode,
+int exfat_find_empty_entry(struct inode *inode,
struct exfat_chain *p_dir, int num_entries,
struct exfat_entry_set_cache *es)
{
@@ -587,7 +587,7 @@ unlock:
}
/* lookup a file */
-static int exfat_find(struct inode *dir, struct qstr *qname,
+static int exfat_find(struct inode *dir, const struct qstr *qname,
struct exfat_dir_entry *info)
{
int ret, dentry, count;
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 1729bf42eb51..8243d94ceaf4 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -789,7 +789,7 @@ int exfat_create_upcase_table(struct super_block *sb)
return ret;
}
- if (exfat_get_next_cluster(sb, &(clu.dir)))
+ if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8926e63f5bb7..7f9592856bf7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -31,6 +31,16 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi)
kfree(sbi->options.iocharset);
}
+static void exfat_set_iocharset(struct exfat_mount_options *opts,
+ char *iocharset)
+{
+ opts->iocharset = iocharset;
+ if (!strcmp(opts->iocharset, "utf8"))
+ opts->utf8 = 1;
+ else
+ opts->utf8 = 0;
+}
+
static void exfat_put_super(struct super_block *sb)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -243,11 +253,11 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_u32oct("allow_utime", Opt_allow_utime),
fsparam_string("iocharset", Opt_charset),
fsparam_enum("errors", Opt_errors, exfat_param_enums),
- fsparam_flag("discard", Opt_discard),
+ fsparam_flag_no("discard", Opt_discard),
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
- fsparam_flag("zero_size_dir", Opt_zero_size_dir),
+ fsparam_flag_no("zero_size_dir", Opt_zero_size_dir),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
__fsparam(NULL, "debug", Opt_debug, fs_param_deprecated,
@@ -292,14 +302,14 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_charset:
exfat_free_iocharset(sbi);
- opts->iocharset = param->string;
+ exfat_set_iocharset(opts, param->string);
param->string = NULL;
break;
case Opt_errors:
opts->errors = result.uint_32;
break;
case Opt_discard:
- opts->discard = 1;
+ opts->discard = !result.negated;
break;
case Opt_keep_last_dots:
opts->keep_last_dots = 1;
@@ -317,7 +327,7 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
opts->time_offset = result.int_32;
break;
case Opt_zero_size_dir:
- opts->zero_size_dir = true;
+ opts->zero_size_dir = !result.negated;
break;
case Opt_utf8:
case Opt_debug:
@@ -664,8 +674,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
/* set up enough so that it can read an inode */
exfat_hash_init(sb);
- if (!strcmp(sbi->options.iocharset, "utf8"))
- opts->utf8 = 1;
+ if (sbi->options.utf8)
+ set_default_d_op(sb, &exfat_utf8_dentry_ops);
else {
sbi->nls_io = load_nls(sbi->options.iocharset);
if (!sbi->nls_io) {
@@ -674,12 +684,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
err = -EINVAL;
goto free_table;
}
- }
-
- if (sbi->options.utf8)
- set_default_d_op(sb, &exfat_utf8_dentry_ops);
- else
set_default_d_op(sb, &exfat_dentry_ops);
+ }
root_inode = new_inode(sb);
if (!root_inode) {
@@ -742,12 +748,44 @@ static void exfat_free(struct fs_context *fc)
static int exfat_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
+ struct exfat_sb_info *remount_sbi = fc->s_fs_info;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_mount_options *new_opts = &remount_sbi->options;
+ struct exfat_mount_options *cur_opts = &sbi->options;
+
fc->sb_flags |= SB_NODIRATIME;
sync_filesystem(sb);
- mutex_lock(&EXFAT_SB(sb)->s_lock);
+ mutex_lock(&sbi->s_lock);
exfat_clear_volume_dirty(sb);
- mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ mutex_unlock(&sbi->s_lock);
+
+ if (new_opts->allow_utime == (unsigned short)-1)
+ new_opts->allow_utime = ~new_opts->fs_dmask & 0022;
+
+ /*
+ * Since the old settings of these mount options are cached in
+ * inodes or dentries, they cannot be modified dynamically.
+ */
+ if (strcmp(new_opts->iocharset, cur_opts->iocharset) ||
+ new_opts->keep_last_dots != cur_opts->keep_last_dots ||
+ new_opts->sys_tz != cur_opts->sys_tz ||
+ new_opts->time_offset != cur_opts->time_offset ||
+ !uid_eq(new_opts->fs_uid, cur_opts->fs_uid) ||
+ !gid_eq(new_opts->fs_gid, cur_opts->fs_gid) ||
+ new_opts->fs_fmask != cur_opts->fs_fmask ||
+ new_opts->fs_dmask != cur_opts->fs_dmask ||
+ new_opts->allow_utime != cur_opts->allow_utime)
+ return -EINVAL;
+
+ if (new_opts->discard != cur_opts->discard &&
+ new_opts->discard &&
+ !bdev_max_discard_sectors(sb->s_bdev)) {
+ exfat_warn(sb, "remounting with \"discard\" option, but the device does not support discard");
+ return -EINVAL;
+ }
+
+ swap(*cur_opts, *new_opts);
return 0;
}
@@ -777,8 +815,8 @@ static int exfat_init_fs_context(struct fs_context *fc)
sbi->options.fs_fmask = current->fs->umask;
sbi->options.fs_dmask = current->fs->umask;
sbi->options.allow_utime = -1;
- sbi->options.iocharset = exfat_default_iocharset;
sbi->options.errors = EXFAT_ERRORS_RO;
+ exfat_set_iocharset(&sbi->options, exfat_default_iocharset);
fc->s_fs_info = sbi;
fc->ops = &exfat_context_ops;
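
fsparam_flag_no() registers both the plain option and a "no"-prefixed
negation, and fs_parse() reports which form was seen in result.negated; that
is what lets the handlers above clear discard/zero_size_dir on remount
instead of only setting them. A standalone model of the matching rule:

	#include <assert.h>
	#include <string.h>

	struct parse_result { int negated; };

	/* "no<name>" negates "<name>", as with fsparam_flag_no(). */
	static int match_flag_no(const char *param, const char *name,
				 struct parse_result *res)
	{
		if (!strcmp(param, name)) {
			res->negated = 0;
			return 1;
		}
		if (!strncmp(param, "no", 2) && !strcmp(param + 2, name)) {
			res->negated = 1;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		struct parse_result r;

		assert(match_flag_no("discard", "discard", &r) && !r.negated);
		assert(match_flag_no("nodiscard", "discard", &r) && r.negated);
		return 0;
	}
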
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c9ca41d91a6c..01873c2a34ad 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,31 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Ext3 configs are here for backward compatibility with old configs which may
-# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
-# kernels after the removal of ext3 driver.
-config EXT3_FS
- tristate "The Extended 3 (ext3) filesystem"
- select EXT4_FS
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS
- select EXT4_FS_POSIX_ACL
- select FS_POSIX_ACL
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS
- select EXT4_FS_SECURITY
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
select BUFFER_HEAD
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6cb784a56b3b..57087da6c7be 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1450,7 +1450,9 @@ struct ext4_super_block {
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
- __le32 s_reserved[94]; /* Padding to the end of the block */
+ __le16 s_def_resuid_hi;
+ __le16 s_def_resgid_hi;
+ __le32 s_reserved[93]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1820,6 +1822,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+static inline int ext4_get_resuid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resuid) |
+ le16_to_cpu(es->s_def_resuid_hi) << 16;
+}
+
+static inline int ext4_get_resgid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resgid) |
+ le16_to_cpu(es->s_def_resgid_hi) << 16;
+}
+
/*
* Returns: sbi->field[index]
* Used to access an array element from the following sbi fields which require
@@ -1990,6 +2004,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
+ * Check whether the inode is tracked as orphan (either in orphan file or
+ * orphan list).
+ */
+static inline bool ext4_inode_orphan_tracked(struct inode *inode)
+{
+ return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
+ !list_empty(&EXT4_I(inode)->i_orphan);
+}
+
+/*
* Codes for operating systems
*/
#define EXT4_OS_LINUX 0
@@ -3142,6 +3166,8 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
+extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
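
s_def_resuid_hi/s_def_resgid_hi extend the reserved uid/gid to 32 bits while
keeping the legacy 16-bit fields in place; ext4_get_resuid() above and
ext4_sb_setparams() below recombine and split the halves. The arithmetic,
standalone:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t uid = 70000;		/* does not fit in 16 bits */
		uint16_t lo = uid & 0xFFFF;	/* legacy s_def_resuid */
		uint16_t hi = uid >> 16;	/* new s_def_resuid_hi */

		assert(((uint32_t)hi << 16 | lo) == uid);
		return 0;
	}
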
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 42bee1d4f9f9..fa66b08de999 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -663,7 +663,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
- blk_opf_t write_flags = REQ_SYNC;
+ blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS;
struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
	/* Add REQ_FUA | REQ_PREFLUSH only for the tail */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 93240e35ee36..7a8b30932189 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -354,7 +354,7 @@ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
* to cleanup the orphan list in ext4_handle_inode_extension(). Do it
* now.
*/
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) {
handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 91185c40f755..22fc333244ef 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -74,7 +74,8 @@ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2)
static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info,
struct ext4_fsmap *rec)
{
- return rec->fmr_physical < info->gfi_low.fmr_physical;
+ return rec->fmr_physical + rec->fmr_length <=
+ info->gfi_low.fmr_physical;
}
/*
@@ -200,15 +201,18 @@ static int ext4_getfsmap_meta_helper(struct super_block *sb,
ext4_group_first_block_no(sb, agno));
fs_end = fs_start + EXT4_C2B(sbi, len);
- /* Return relevant extents from the meta_list */
+ /*
+ * Return relevant extents from the meta_list. We emit all extents that
+ * partially/fully overlap with the query range
+ */
list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
- if (p->fmr_physical < info->gfi_next_fsblk) {
+ if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) {
list_del(&p->fmr_list);
kfree(p);
continue;
}
- if (p->fmr_physical <= fs_start ||
- p->fmr_physical + p->fmr_length <= fs_end) {
+ if (p->fmr_physical <= fs_end &&
+ p->fmr_physical + p->fmr_length > fs_start) {
/* Emit the retained free extent record if present */
if (info->gfi_lastfree.fmr_owner) {
error = ext4_getfsmap_helper(sb, info,
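
Both fixed predicates treat an extent as the half-open range
[fmr_physical, fmr_physical + fmr_length): a record is only "before" the low
key once it ends at or below it, and it is emitted when it overlaps the query
window. The general overlap test, standalone:

	#include <assert.h>
	#include <stdint.h>

	/* Two half-open ranges overlap iff each starts before the other ends. */
	static int overlaps(uint64_t a, uint64_t alen, uint64_t b, uint64_t blen)
	{
		return a < b + blen && b < a + alen;
	}

	int main(void)
	{
		assert(overlaps(10, 5, 14, 4));		/* [10,15) vs [14,18) */
		assert(!overlaps(10, 5, 15, 4));	/* [10,15) vs [15,19) */
		return 0;
	}
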
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index d45124318200..da76353b3a57 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1025,7 +1025,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
/* Go read the buffer for the next level down */
- bh = ext4_sb_bread(inode->i_sb, nr, 0);
+ bh = ext4_sb_bread_nofail(inode->i_sb, nr);
/*
* A read failure? Report error and clear slot
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5b7a15db4953..f9e4ac87211e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3872,47 +3872,12 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
-static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
-{
- /* must be a directio to fall back to buffered */
- if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
- (IOMAP_WRITE | IOMAP_DIRECT))
- return false;
-
- /* atomic writes are all-or-nothing */
- if (flags & IOMAP_ATOMIC)
- return false;
-
- /* can only try again if we wrote nothing */
- return written == 0;
-}
-
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
-{
- /*
- * Check to see whether an error occurred while writing out the data to
- * the allocated blocks. If so, return the magic error code for
- * non-atomic write so that we fallback to buffered I/O and attempt to
- * complete the remainder of the I/O.
- * For non-atomic writes, any blocks that may have been
- * allocated in preparation for the direct I/O will be reused during
- * buffered I/O. For atomic write, we never fallback to buffered-io.
- */
- if (ext4_want_directio_fallback(flags, written))
- return -ENOTBLK;
-
- return 0;
-}
-
const struct iomap_ops ext4_iomap_ops = {
.iomap_begin = ext4_iomap_begin,
- .iomap_end = ext4_iomap_end,
};
const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_begin = ext4_iomap_overwrite_begin,
- .iomap_end = ext4_iomap_end,
};
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
@@ -4287,7 +4252,11 @@ int ext4_can_truncate(struct inode *inode)
* We have to make sure i_disksize gets properly updated before we truncate
* page cache due to hole punching or zero range. Otherwise i_disksize update
* can get lost as it may have been postponed to submission of writeback but
- * that will never happen after we truncate page cache.
+ * that will never happen if we remove the folio containing i_size from the
+ * page cache. Also if we punch hole within i_size but above i_disksize,
+ * following ext4_page_mkwrite() may mistakenly allocate written blocks over
+ * the hole and thus introduce allocated blocks beyond i_disksize which is
+ * not allowed (e2fsck would complain in case of crash).
*/
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t len)
@@ -4298,9 +4267,11 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t size = i_size_read(inode);
WARN_ON(!inode_is_locked(inode));
- if (offset > size || offset + len < size)
+ if (offset > size)
return 0;
+ if (offset + len < size)
+ size = offset + len;
if (EXT4_I(inode)->i_disksize >= size)
return 0;
@@ -4748,7 +4719,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
* old inodes get re-used with the upper 16 bits of the
* uid/gid intact.
*/
- if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+ if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
} else {
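
The reworked ext4_update_disksize_before_punch() clamps the comparison point
to the end of the punched range, so a hole that lies within i_size but above
i_disksize now forces the update the comment above describes. A standalone
model of the range logic:

	#include <assert.h>
	#include <stdint.h>

	static int needs_disksize_update(uint64_t isize, uint64_t disksize,
					 uint64_t off, uint64_t len)
	{
		uint64_t size = isize;

		if (off > size)
			return 0;		/* punch entirely beyond i_size */
		if (off + len < size)
			size = off + len;	/* punch ends below i_size */
		return disksize < size;		/* update i_disksize first? */
	}

	int main(void)
	{
		/* Hole inside i_size but above i_disksize: update needed. */
		assert(needs_disksize_update(100, 10, 20, 30));
		assert(!needs_disksize_update(100, 60, 20, 30));
		return 0;
	}
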
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 84e3c73952d7..a93a7baae990 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -27,14 +27,16 @@
#include "fsmap.h"
#include <trace/events/ext4.h>
-typedef void ext4_update_sb_callback(struct ext4_super_block *es,
- const void *arg);
+typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es,
+ const void *arg);
/*
* Superblock modification callback function for changing file system
* label
*/
-static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
+static void ext4_sb_setlabel(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
{
/* Sanity check, this should never happen */
BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);
@@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
* Superblock modification callback function for changing file system
* UUID.
*/
-static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg)
+static void ext4_sb_setuuid(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
{
memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
}
@@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
goto out_err;
lock_buffer(bh);
- func(es, arg);
+ func(sbi, es, arg);
ext4_superblock_csum_set(sb);
unlock_buffer(bh);
@@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
unlock_buffer(bh);
goto out_bh;
}
- func(es, arg);
+ func(EXT4_SB(sb), es, arg);
if (ext4_has_feature_metadata_csum(sb))
es->s_checksum = ext4_superblock_csum(es);
set_buffer_uptodate(bh);
@@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp,
return ret;
}
+
+#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \
+ EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \
+ EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \
+ EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \
+ EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \
+ EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \
+ EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \
+ EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \
+ EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \
+ EXT4_TUNE_FL_ENCODING_FLAGS)
+
+#define EXT4_TUNE_SET_COMPAT_SUPP \
+ (EXT4_FEATURE_COMPAT_DIR_INDEX | \
+ EXT4_FEATURE_COMPAT_STABLE_INODES)
+#define EXT4_TUNE_SET_INCOMPAT_SUPP \
+ (EXT4_FEATURE_INCOMPAT_EXTENTS | \
+ EXT4_FEATURE_INCOMPAT_EA_INODE | \
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)
+#define EXT4_TUNE_SET_RO_COMPAT_SUPP \
+ (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \
+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4_FEATURE_RO_COMPAT_PROJECT | \
+ EXT4_FEATURE_RO_COMPAT_VERITY)
+
+#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0)
+
+#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \
+ SB_ENC_NO_COMPAT_FALLBACK_FL)
+
+static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi,
+ struct ext4_tune_sb_params __user *params)
+{
+ struct ext4_tune_sb_params ret;
+ struct ext4_super_block *es = sbi->s_es;
+
+ memset(&ret, 0, sizeof(ret));
+ ret.set_flags = TUNE_OPS_SUPPORTED;
+ ret.errors_behavior = le16_to_cpu(es->s_errors);
+ ret.mnt_count = le16_to_cpu(es->s_mnt_count);
+ ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count);
+ ret.checkinterval = le32_to_cpu(es->s_checkinterval);
+ ret.last_check_time = le32_to_cpu(es->s_lastcheck);
+ ret.reserved_blocks = ext4_r_blocks_count(es);
+ ret.blocks_count = ext4_blocks_count(es);
+ ret.reserved_uid = ext4_get_resuid(es);
+ ret.reserved_gid = ext4_get_resgid(es);
+ ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts);
+ ret.def_hash_alg = es->s_def_hash_version;
+ ret.raid_stride = le16_to_cpu(es->s_raid_stride);
+ ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width);
+ ret.encoding = le16_to_cpu(es->s_encoding);
+ ret.encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ strscpy_pad(ret.mount_opts, es->s_mount_opts);
+ ret.feature_compat = le32_to_cpu(es->s_feature_compat);
+ ret.feature_incompat = le32_to_cpu(es->s_feature_incompat);
+ ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat);
+ ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP;
+ ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP;
+ ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP;
+ ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP;
+ ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP;
+ ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP;
+ if (copy_to_user(params, &ret, sizeof(ret)))
+ return -EFAULT;
+ return 0;
+}
+
+static void ext4_sb_setparams(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ const struct ext4_tune_sb_params *params = arg;
+
+ if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR)
+ es->s_errors = cpu_to_le16(params->errors_behavior);
+ if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT)
+ es->s_mnt_count = cpu_to_le16(params->mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT)
+ es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL)
+ es->s_checkinterval = cpu_to_le32(params->checkinterval);
+ if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME)
+ es->s_lastcheck = cpu_to_le32(params->last_check_time);
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) {
+ ext4_fsblk_t blk = params->reserved_blocks;
+
+ es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) {
+ int uid = params->reserved_uid;
+
+ es->s_def_resuid = cpu_to_le16(uid & 0xFFFF);
+ es->s_def_resuid_hi = cpu_to_le16(uid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) {
+ int gid = params->reserved_gid;
+
+ es->s_def_resgid = cpu_to_le16(gid & 0xFFFF);
+ es->s_def_resgid_hi = cpu_to_le16(gid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS)
+ es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts);
+ if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ es->s_def_hash_version = params->def_hash_alg;
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE)
+ es->s_raid_stride = cpu_to_le16(params->raid_stride);
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH)
+ es->s_raid_stripe_width =
+ cpu_to_le32(params->raid_stripe_width);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING)
+ es->s_encoding = cpu_to_le16(params->encoding);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)
+ es->s_encoding_flags = cpu_to_le16(params->encoding_flags);
+	if (params->set_flags & EXT4_TUNE_FL_MOUNT_OPTS)
+		strscpy_pad(es->s_mount_opts, params->mount_opts);
+ if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ es->s_feature_compat |=
+ cpu_to_le32(params->set_feature_compat_mask);
+ es->s_feature_incompat |=
+ cpu_to_le32(params->set_feature_incompat_mask);
+ es->s_feature_ro_compat |=
+ cpu_to_le32(params->set_feature_ro_compat_mask);
+ es->s_feature_compat &=
+ ~cpu_to_le32(params->clear_feature_compat_mask);
+ es->s_feature_incompat &=
+ ~cpu_to_le32(params->clear_feature_incompat_mask);
+ es->s_feature_ro_compat &=
+ ~cpu_to_le32(params->clear_feature_ro_compat_mask);
+ if (params->set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)
+ es->s_def_hash_version = sbi->s_def_hash_version;
+ if (params->set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK)
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+}
+
+static int ext4_ioctl_set_tune_sb(struct file *filp,
+ struct ext4_tune_sb_params __user *in)
+{
+ struct ext4_tune_sb_params params;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int enabling_casefold = 0;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&params, in, sizeof(params)))
+ return -EFAULT;
+
+ if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0)
+ return -EOPNOTSUPP;
+
+ if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) &&
+ (params.errors_behavior > EXT4_ERRORS_PANIC))
+ return -EINVAL;
+
+ if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) &&
+ (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) &&
+ ((params.def_hash_alg > DX_HASH_LAST) ||
+ (params.def_hash_alg == DX_HASH_SIPHASH)))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_FEATURES) &&
+ (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES))
+ return -EINVAL;
+
+ if (params.set_flags & EXT4_TUNE_FL_FEATURES) {
+ params.set_feature_compat_mask =
+ params.feature_compat &
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask =
+ params.feature_incompat &
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask =
+ params.feature_ro_compat &
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ params.clear_feature_compat_mask =
+ ~params.feature_compat &
+ le32_to_cpu(es->s_feature_compat);
+ params.clear_feature_incompat_mask =
+ ~params.feature_incompat &
+ le32_to_cpu(es->s_feature_incompat);
+ params.clear_feature_ro_compat_mask =
+ ~params.feature_ro_compat &
+ le32_to_cpu(es->s_feature_ro_compat);
+ params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ if ((params.set_feature_compat_mask &
+ ~EXT4_TUNE_SET_COMPAT_SUPP) ||
+ (params.set_feature_incompat_mask &
+ ~EXT4_TUNE_SET_INCOMPAT_SUPP) ||
+ (params.set_feature_ro_compat_mask &
+ ~EXT4_TUNE_SET_RO_COMPAT_SUPP) ||
+ (params.clear_feature_compat_mask &
+ ~EXT4_TUNE_CLEAR_COMPAT_SUPP) ||
+ (params.clear_feature_incompat_mask &
+ ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) ||
+ (params.clear_feature_ro_compat_mask &
+ ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP))
+ return -EOPNOTSUPP;
+
+ /*
+ * Filter out the features that are already set from
+ * the set_mask.
+ */
+ params.set_feature_compat_mask &=
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask &=
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask &=
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ if ((params.set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)) {
+ enabling_casefold = 1;
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) {
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING;
+ }
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) {
+ params.encoding_flags = 0;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS;
+ }
+ }
+ if ((params.set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ uuid_t uu;
+
+ memcpy(&uu, sbi->s_hash_seed, UUID_SIZE);
+ if (uuid_is_null(&uu))
+ generate_random_uuid((char *)
+ &sbi->s_hash_seed);
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+ else if (sbi->s_def_hash_version == 0)
+ sbi->s_def_hash_version = DX_HASH_HALF_MD4;
+ if (!(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) &&
+ !(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) {
+#ifdef __CHAR_UNSIGNED__
+ sbi->s_hash_unsigned = 3;
+#else
+ sbi->s_hash_unsigned = 0;
+#endif
+ }
+ }
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding == 0)
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ else if (params.encoding != EXT4_ENC_UTF8_12_1)
+ return -EINVAL;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding_flags & ~SB_ENC_SUPP_MASK)
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params);
+ mnt_drop_write_file(filp);
+
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+
+ return ret;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1616,6 +1908,11 @@ resizefs_out:
return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
case EXT4_IOC_SETFSUUID:
return ext4_ioctl_setuuid(filp, (const void __user *)arg);
+ case EXT4_IOC_GET_TUNE_SB_PARAM:
+ return ext4_ioctl_get_tune_sb(EXT4_SB(sb),
+ (void __user *)arg);
+ case EXT4_IOC_SET_TUNE_SB_PARAM:
+ return ext4_ioctl_set_tune_sb(filp, (void __user *)arg);
default:
return -ENOTTY;
}
@@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
#endif
-static void set_overhead(struct ext4_super_block *es, const void *arg)
+static void set_overhead(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
{
es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
}
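
When userspace passes absolute feature words (EXT4_TUNE_FL_FEATURES), the
handler above first converts them into set/clear masks relative to the
current superblock and then validates both against the supported lists. The
mask algebra, standalone:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t cur = 0x0F, want = 0x33;
		uint32_t set = want & ~cur;	/* bits to turn on  */
		uint32_t clr = ~want & cur;	/* bits to turn off */

		assert(set == 0x30 && clr == 0x0C);
		assert(((cur | set) & ~clr) == want);	/* round-trips */
		return 0;
	}
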
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8b18802e83eb..9087183602e4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3655,16 +3655,26 @@ static void ext4_discard_work(struct work_struct *work)
static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
{
+ if (!sbi->s_mb_avg_fragment_size)
+ return;
+
for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
+
kfree(sbi->s_mb_avg_fragment_size);
+ sbi->s_mb_avg_fragment_size = NULL;
}
static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
{
+ if (!sbi->s_mb_largest_free_orders)
+ return;
+
for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
xa_destroy(&sbi->s_mb_largest_free_orders[i]);
+
kfree(sbi->s_mb_largest_free_orders);
+ sbi->s_mb_largest_free_orders = NULL;
}
int ext4_mb_init(struct super_block *sb)
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 51661570cf3b..ab1ff51302fb 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -231,9 +231,9 @@ static int kmmpd(void *data)
* Adjust the mmp_check_interval depending on how much time
* it took for the MMP block to be written.
*/
- mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
- EXT4_MMP_MAX_CHECK_INTERVAL),
- EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MIN_CHECK_INTERVAL,
+ EXT4_MMP_MAX_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
}
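
clamp(val, lo, hi) is equivalent to the removed max(min(val, hi), lo) but
names the bounds once, in a fixed order. A standalone model (the kernel macro
additionally avoids double evaluation):

	#include <assert.h>

	#define clamp(val, lo, hi) \
		((val) < (lo) ? (lo) : (val) > (hi) ? (hi) : (val))

	int main(void)
	{
		assert(clamp(5, 10, 100) == 10);
		assert(clamp(500, 10, 100) == 100);
		assert(clamp(42, 10, 100) == 42);
		return 0;
	}
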
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index adae3caf175a..4b091c21908f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -225,7 +225,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
do {
if (bh_offset(bh) + blocksize <= from)
continue;
- if (bh_offset(bh) > to)
+ if (bh_offset(bh) >= to)
break;
wait_on_buffer(bh);
if (buffer_uptodate(bh))
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 524d4658fa40..33c3a89396b1 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -109,11 +109,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
- /*
- * Inode orphaned in orphan file or in orphan list?
- */
- if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
- !list_empty(&EXT4_I(inode)->i_orphan))
+ if (ext4_inode_orphan_tracked(inode))
return 0;
/*
@@ -587,9 +583,20 @@ int ext4_init_orphan_info(struct super_block *sb)
ext4_msg(sb, KERN_ERR, "get orphan inode failed");
return PTR_ERR(inode);
}
+ /*
+ * This is just an artificial limit to prevent corrupted fs from
+ * consuming absurd amounts of memory when pinning blocks of orphan
+ * file in memory.
+ */
+ if (inode->i_size > 8 << 20) {
+ ext4_msg(sb, KERN_ERR, "orphan file too big: %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto out_put;
+ }
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc_array(oi->of_blocks,
+ oi->of_binfo = kvmalloc_array(oi->of_blocks,
sizeof(struct ext4_orphan_block),
GFP_KERNEL);
if (!oi->of_binfo) {
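Note: the hunk above bounds the orphan file at 8 MiB before sizing the per-block array, and kvmalloc_array() lets a large-but-legal allocation fall back to vmalloc. A sketch of the guard arithmetic, with an assumed 4 KiB block size and an invented per-entry size:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            unsigned long long i_size = 4ULL << 20; /* 4 MiB orphan file */
            unsigned int blocksize_bits = 12;       /* assumed 4 KiB blocks */

            /* same artificial cap as ext4_init_orphan_info(): a corrupted
             * fs must not make us pin absurd amounts of memory */
            if (i_size > (8ULL << 20)) {
                    fprintf(stderr, "orphan file too big: %llu\n", i_size);
                    return 1;
            }

            unsigned long long of_blocks = i_size >> blocksize_bits;
            /* calloc stands in for kvmalloc_array(); 32 is an assumed
             * sizeof(struct ext4_orphan_block), for illustration only */
            void *of_binfo = calloc(of_blocks, 32);

            printf("pinning %llu orphan blocks\n", of_blocks);
            free(of_binfo);
            return 0;
    }
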
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f2d4014d128..33e7c08c9529 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -265,6 +265,15 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
+struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block)
+{
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL;
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
+}
+
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
@@ -1438,9 +1447,9 @@ static void ext4_free_in_core_inode(struct inode *inode)
static void ext4_destroy_inode(struct inode *inode)
{
- if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
+ if (ext4_inode_orphan_tracked(inode)) {
ext4_msg(inode->i_sb, KERN_ERR,
- "Inode %lu (%p): orphan list check failed!",
+ "Inode %lu (%p): inode tracked as orphan!",
inode->i_ino, EXT4_I(inode));
print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
EXT4_I(inode), sizeof(struct ext4_inode_info),
@@ -2466,7 +2475,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
struct ext4_fs_context *m_ctx)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *s_mount_opts = NULL;
+ char s_mount_opts[65];
struct ext4_fs_context *s_ctx = NULL;
struct fs_context *fc = NULL;
int ret = -ENOMEM;
@@ -2474,15 +2483,11 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
if (!sbi->s_es->s_mount_opts[0])
return 0;
- s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
- sizeof(sbi->s_es->s_mount_opts),
- GFP_KERNEL);
- if (!s_mount_opts)
- return ret;
+ strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts);
fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
if (!fc)
- goto out_free;
+ return -ENOMEM;
s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
if (!s_ctx)
@@ -2514,11 +2519,8 @@ parse_failed:
ret = 0;
out_free:
- if (fc) {
- ext4_fc_free(fc);
- kfree(fc);
- }
- kfree(s_mount_opts);
+ ext4_fc_free(fc);
+ kfree(fc);
return ret;
}
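Note: s_mount_opts is a fixed 64-byte superblock field, so the kstrndup() heap copy can become a 65-byte stack buffer filled by strscpy_pad(), which guarantees NUL termination and zero-pads the remainder. A simplified userspace stand-in (the real kernel helper also reports truncation with -E2BIG and, in the two-argument form used above, infers the destination size):

    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>

    /* simplified stand-in for the kernel's strscpy_pad(): copy with
     * guaranteed NUL termination, then zero the rest of dst */
    static ssize_t my_strscpy_pad(char *dst, const char *src, size_t size)
    {
            size_t len = strnlen(src, size - 1);

            memcpy(dst, src, len);
            memset(dst + len, 0, size - len);
            return (ssize_t)len;
    }

    int main(void)
    {
            char s_mount_opts[65];  /* 64-byte field + NUL, as in the hunk */

            my_strscpy_pad(s_mount_opts, "errors=remount-ro,acl",
                           sizeof(s_mount_opts));
            printf("'%s'\n", s_mount_opts);
            return 0;
    }
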
@@ -2964,11 +2966,11 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
}
if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
- le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ ext4_get_resuid(es) != EXT4_DEF_RESUID)
SEQ_OPTS_PRINT("resuid=%u",
from_kuid_munged(&init_user_ns, sbi->s_resuid));
if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
- le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ ext4_get_resgid(es) != EXT4_DEF_RESGID)
SEQ_OPTS_PRINT("resgid=%u",
from_kgid_munged(&init_user_ns, sbi->s_resgid));
def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
@@ -5283,8 +5285,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_set_def_opts(sb, es);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es));
+	sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es));
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5a6fe1513fd2..ce7253b3f549 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -251,6 +251,10 @@ check_xattrs(struct inode *inode, struct buffer_head *bh,
err_str = "invalid ea_ino";
goto errout;
}
+ if (ea_ino && !size) {
+ err_str = "invalid size in ea xattr";
+ goto errout;
+ }
if (size > EXT4_XATTR_SIZE_MAX) {
err_str = "e_value size too large";
goto errout;
@@ -1019,7 +1023,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
struct ext4_iloc iloc;
- s64 ref_count;
+ u64 ref_count;
int ret;
inode_lock_nested(ea_inode, I_MUTEX_XATTR);
@@ -1029,13 +1033,17 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
goto out;
ref_count = ext4_xattr_inode_get_ref(ea_inode);
+ if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) {
+ ext4_error_inode(ea_inode, __func__, __LINE__, 0,
+ "EA inode %lu ref wraparound: ref_count=%lld ref_change=%d",
+ ea_inode->i_ino, ref_count, ref_change);
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
ref_count += ref_change;
ext4_xattr_inode_set_ref(ea_inode, ref_count);
if (ref_change > 0) {
- WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
- ea_inode->i_ino, ref_count);
-
if (ref_count == 1) {
WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
ea_inode->i_ino, ea_inode->i_nlink);
@@ -1044,9 +1052,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
ext4_orphan_del(handle, ea_inode);
}
} else {
- WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
- ea_inode->i_ino, ref_count);
-
if (ref_count == 0) {
WARN_ONCE(ea_inode->i_nlink != 1,
"EA inode %lu i_nlink=%u",
@@ -1530,7 +1535,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
!(current->flags & PF_MEMALLOC_NOFS));
- ea_data = kvmalloc(value_len, GFP_KERNEL);
+ ea_data = kvmalloc(value_len, GFP_NOFS);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
return NULL;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index db3831f7f2f5..bbe07e3a6c75 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1442,6 +1442,34 @@ u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
return get_sectors_written(sbi->sb->s_bdev);
}
+static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type)
+{
+ cpc->stats.times[type] = ktime_get();
+}
+
+static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+ unsigned long long sb_diff, cur_diff;
+ enum cp_time ct;
+
+ sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END],
+ sbi->cp_stats.times[CP_TIME_START]);
+ cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END],
+ cpc->stats.times[CP_TIME_START]);
+
+ if (cur_diff > sb_diff) {
+ sbi->cp_stats = cpc->stats;
+ if (cur_diff < CP_LONG_LATENCY_THRESHOLD)
+ return;
+
+ f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff);
+ for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++)
+ f2fs_warn(sbi, "Step#%d: %llu ms", ct,
+ (u64)ktime_ms_delta(cpc->stats.times[ct + 1],
+ cpc->stats.times[ct]));
+ }
+}
+
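Note: stat_cp_time()/check_cp_time() above implement a per-stage timestamp array — record a monotonic time after each phase, then print consecutive deltas when the whole checkpoint crosses the latency threshold. A userspace sketch of the pattern using clock_gettime(CLOCK_MONOTONIC); the stage names and sleeps are invented for illustration:

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    enum stage { ST_START, ST_LOCK, ST_FLUSH, ST_END, ST_MAX };

    static long long now_ms(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
    }

    int main(void)
    {
            long long t[ST_MAX];

            t[ST_START] = now_ms();
            usleep(10000);                  /* pretend to take locks */
            t[ST_LOCK] = now_ms();
            usleep(20000);                  /* pretend to flush metadata */
            t[ST_FLUSH] = now_ms();
            t[ST_END] = now_ms();

            /* same reporting shape as check_cp_time(): per-stage deltas */
            for (int s = ST_START; s < ST_MAX - 1; s++)
                    printf("Step#%d: %lld ms\n", s, t[s + 1] - t[s]);
            return 0;
    }
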
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1459,6 +1487,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ stat_cp_time(cpc, CP_TIME_SYNC_META);
+
/* start to update checkpoint, cp ver is already updated previously */
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
@@ -1555,20 +1585,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ stat_cp_time(cpc, CP_TIME_SYNC_CP_META);
+
/* Wait for all dirty meta pages to be submitted for IO */
f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
+ stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META);
/* wait for previous submitted meta pages writeback */
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
+ stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA);
/* flush all device cache */
err = f2fs_flush_device_cache(sbi);
if (err)
return err;
+ stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE);
/* barrier and flush checkpoint cp pack 2 page if it can */
commit_checkpoint(sbi, ckpt, start_blk);
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
+ stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP);
/*
* invalidate intermediate page cache borrowed from meta inode which are
@@ -1613,6 +1649,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned long long ckpt_ver;
int err = 0;
+ stat_cp_time(cpc, CP_TIME_START);
+
if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi))
return -EROFS;
@@ -1624,6 +1662,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason != CP_RESIZE)
f2fs_down_write(&sbi->cp_global_sem);
+ stat_cp_time(cpc, CP_TIME_LOCK);
+
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
@@ -1639,6 +1679,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (err)
goto out;
+ stat_cp_time(cpc, CP_TIME_OP_LOCK);
+
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
f2fs_flush_merged_writes(sbi);
@@ -1678,6 +1720,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_sit_entries(sbi, cpc);
+ stat_cp_time(cpc, CP_TIME_FLUSH_META);
+
/* save inmem log status */
f2fs_save_inmem_curseg(sbi);
@@ -1695,6 +1739,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
stat_inc_cp_count(sbi);
stop:
unblock_operations(sbi);
+ stat_cp_time(cpc, CP_TIME_END);
+ check_cp_time(sbi, cpc);
if (cpc->reason & CP_RECOVERY)
f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
@@ -1778,6 +1824,7 @@ static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
req->ret = ret;
+ req->delta_time = diff;
complete(&req->wait);
sum_diff += diff;
@@ -1873,6 +1920,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
else
flush_remained_ckpt_reqs(sbi, &req);
+ if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) {
+ f2fs_warn_ratelimited(sbi,
+ "blocked on checkpoint for %u ms", cprc->peak_time);
+ dump_stack();
+ }
+
return req.ret;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 5c1f47e45dab..6ad8d3bc6df7 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1215,9 +1215,11 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock)
{
void *fsdata = NULL;
struct page *pagep;
+ struct page **rpages;
int log_cluster_size = F2FS_I(inode)->i_log_cluster_size;
pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) <<
log_cluster_size;
+ int i;
int err;
err = f2fs_is_compressed_cluster(inode, start_idx);
@@ -1238,27 +1240,30 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock)
if (err <= 0)
return err;
- if (err > 0) {
- struct page **rpages = fsdata;
- int cluster_size = F2FS_I(inode)->i_cluster_size;
- int i;
-
- for (i = cluster_size - 1; i >= 0; i--) {
- struct folio *folio = page_folio(rpages[i]);
- loff_t start = folio->index << PAGE_SHIFT;
-
- if (from <= start) {
- folio_zero_segment(folio, 0, folio_size(folio));
- } else {
- folio_zero_segment(folio, from - start,
- folio_size(folio));
- break;
- }
- }
+ rpages = fsdata;
+
+ for (i = (1 << log_cluster_size) - 1; i >= 0; i--) {
+ struct folio *folio = page_folio(rpages[i]);
+ loff_t start = (loff_t)folio->index << PAGE_SHIFT;
+ loff_t offset = from > start ? from - start : 0;
- f2fs_compress_write_end(inode, fsdata, start_idx, true);
+ folio_zero_segment(folio, offset, folio_size(folio));
+
+ if (from >= start)
+ break;
}
- return 0;
+
+ f2fs_compress_write_end(inode, fsdata, start_idx, true);
+
+ err = filemap_write_and_wait_range(inode->i_mapping,
+ round_down(from, 1 << log_cluster_size << PAGE_SHIFT),
+ LLONG_MAX);
+ if (err)
+ return err;
+
+ truncate_pagecache(inode, from);
+
+ return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock);
}
static int f2fs_write_compressed_pages(struct compress_ctx *cc,
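Note: the rewritten loop above zeroes the cluster's pages from the tail backwards — pages wholly beyond the truncation point are zeroed in full, the page containing `from` is zeroed from its in-page offset, and the walk stops there. An array-based sketch of the same walk, with a deliberately tiny page size:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 16            /* tiny page, for demonstration only */
    #define CLUSTER_PAGES 4

    int main(void)
    {
            char cluster[CLUSTER_PAGES][PAGE_SIZE];
            long from = 22;         /* truncation point inside page 1 */

            memset(cluster, 'D', sizeof(cluster));  /* 'D' = live data */

            /* walk from the cluster tail, as in the hunk above */
            for (int i = CLUSTER_PAGES - 1; i >= 0; i--) {
                    long start = (long)i * PAGE_SIZE;
                    long offset = from > start ? from - start : 0;

                    memset(&cluster[i][offset], 0, PAGE_SIZE - offset);
                    if (from >= start)
                            break;  /* reached the page holding 'from' */
            }

            for (int i = 0; i < CLUSTER_PAGES; i++, puts(""))
                    for (int j = 0; j < PAGE_SIZE; j++)
                            putchar(cluster[i][j] ? cluster[i][j] : '.');
            return 0;
    }
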
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7961e0ddfca3..ef38e62cda8f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -733,9 +733,11 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
static bool io_type_is_mergeable(struct f2fs_bio_info *io,
struct f2fs_io_info *fio)
{
+ blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA);
+
if (io->fio.op != fio->op)
return false;
- return io->fio.op_flags == fio->op_flags;
+ return (io->fio.op_flags & mask) == (fio->op_flags & mask);
}
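Note: masking out REQ_PREFLUSH and REQ_FUA before comparing op flags lets a write that happens to carry a flush/FUA hint still merge into an already-open bio. A minimal sketch of the masked comparison; the bit positions below are illustrative, not the real blk_opf_t values:

    #include <stdbool.h>
    #include <stdio.h>

    #define REQ_PREFLUSH (1u << 0)  /* illustrative bit positions only */
    #define REQ_FUA      (1u << 1)
    #define REQ_META     (1u << 2)

    static bool flags_mergeable(unsigned int a, unsigned int b)
    {
            unsigned int mask = ~(REQ_PREFLUSH | REQ_FUA);

            return (a & mask) == (b & mask);
    }

    int main(void)
    {
            printf("%d\n", flags_mergeable(REQ_META, REQ_META | REQ_FUA)); /* 1 */
            printf("%d\n", flags_mergeable(REQ_META, 0));                  /* 0 */
            return 0;
    }
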
static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
@@ -911,7 +913,7 @@ alloc_new:
if (fio->io_wbc)
wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio));
- inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false));
+ inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false));
*fio->last_block = fio->new_blkaddr;
*fio->bio = bio;
@@ -1083,7 +1085,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
}
/* This can handle encryption stuffs */
-static int f2fs_submit_page_read(struct inode *inode, struct folio *folio,
+static void f2fs_submit_page_read(struct inode *inode, struct folio *folio,
block_t blkaddr, blk_opf_t op_flags,
bool for_write)
{
@@ -1092,23 +1094,16 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio,
bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
folio->index, for_write);
- if (IS_ERR(bio))
- return PTR_ERR(bio);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, blkaddr);
- if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) {
- iostat_update_and_unbind_ctx(bio);
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
- bio_put(bio);
- return -EFAULT;
- }
+ if (!bio_add_folio(bio, folio, PAGE_SIZE, 0))
+ f2fs_bug_on(sbi, 1);
+
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
f2fs_submit_read_bio(sbi, bio, DATA);
- return 0;
}
static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
@@ -1265,10 +1260,8 @@ got_it:
return folio;
}
- err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
+ f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
op_flags, for_write);
- if (err)
- goto put_err;
return folio;
put_err:
@@ -1572,6 +1565,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
pgofs = (pgoff_t)map->m_lblk;
end = pgofs + maxblocks;
+ if (flag == F2FS_GET_BLOCK_PRECACHE)
+ mode = LOOKUP_NODE_RA;
+
next_dnode:
if (map->m_may_create) {
if (f2fs_lfs_mode(sbi))
@@ -1778,12 +1774,13 @@ sync_out:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_read_extent_cache_range(&dn,
- start_pgofs, map->m_pblk + ofs,
- map->m_len - ofs);
+ if (map->m_len > ofs)
+ f2fs_update_read_extent_cache_range(&dn,
+ start_pgofs, map->m_pblk + ofs,
+ map->m_len - ofs);
}
if (map->m_next_extent)
- *map->m_next_extent = pgofs + 1;
+ *map->m_next_extent = is_hole ? pgofs + 1 : pgofs;
}
f2fs_put_dnode(&dn);
unlock_out:
@@ -2145,16 +2142,10 @@ submit_and_realloc:
f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
- if (bio == NULL) {
+ if (bio == NULL)
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
f2fs_ra_op_flags(rac), index,
false);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- bio = NULL;
- goto out;
- }
- }
/*
* If the page is under writeback, we need to wait for
@@ -2303,18 +2294,10 @@ submit_and_realloc:
bio = NULL;
}
- if (!bio) {
+ if (!bio)
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i,
f2fs_ra_op_flags(rac),
folio->index, for_write);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- f2fs_decompress_end_io(dic, ret, true);
- f2fs_put_dnode(&dn);
- *bio_ret = NULL;
- return ret;
- }
- }
if (!bio_add_folio(bio, folio, blocksize, 0))
goto submit_and_realloc;
@@ -3639,11 +3622,9 @@ repeat:
err = -EFSCORRUPTED;
goto put_folio;
}
- err = f2fs_submit_page_read(use_cow ?
+ f2fs_submit_page_read(use_cow ?
F2FS_I(inode)->cow_inode : inode,
folio, blkaddr, 0, true);
- if (err)
- goto put_folio;
folio_lock(folio);
if (unlikely(folio->mapping != mapping)) {
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index fffd7749d6d1..48f4f98afb01 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -16,6 +16,21 @@
#include "xattr.h"
#include <trace/events/f2fs.h>
+static inline bool f2fs_should_fallback_to_linear(struct inode *dir)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+ switch (F2FS_OPTION(sbi).lookup_mode) {
+ case LOOKUP_PERF:
+ return false;
+ case LOOKUP_COMPAT:
+ return true;
+ case LOOKUP_AUTO:
+ return !sb_no_casefold_compat_fallback(sbi->sb);
+ }
+ return false;
+}
+
#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
@@ -366,7 +381,7 @@ start_find_entry:
out:
#if IS_ENABLED(CONFIG_UNICODE)
- if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ if (f2fs_should_fallback_to_linear(dir) &&
IS_CASEFOLDED(dir) && !de && use_hash) {
use_hash = false;
goto start_find_entry;
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 199c1e7a83ef..33e09c453c70 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -604,7 +604,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
p = &(*p)->rb_right;
leftmost = false;
} else {
+ f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, "
+ "extent node in rb tree [%u, %u, %u], age [%llu, %llu], "
+ "extent node to insert [%u, %u, %u], age [%llu, %llu]",
+ __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age,
+ en->ei.last_blocks, ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks);
f2fs_bug_on(sbi, 1);
+ return NULL;
}
}
@@ -664,6 +670,15 @@ static void __update_extent_tree_range(struct inode *inode,
if (!et)
return;
+ if (unlikely(len == 0)) {
+ f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, "
+ "extent [%u, %u, %u], age [%llu, %llu]",
+ __func__, type, tei->fofs, tei->blk, tei->len,
+ tei->age, tei->last_blocks);
+ f2fs_bug_on(sbi, 1);
+ return;
+ }
+
if (type == EX_READ)
trace_f2fs_update_read_extent_tree_range(inode, fofs, len,
tei->blk, 0);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6e465bbc85ee..5b4e9548a231 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -131,6 +131,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
* string rather than using the MS_LAZYTIME flag, so this must remain.
*/
#define F2FS_MOUNT_LAZYTIME 0x40000000
+#define F2FS_MOUNT_RESERVE_NODE 0x80000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -155,6 +156,18 @@ enum blkzone_allocation_policy {
BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */
};
+enum bggc_io_aware_policy {
+ AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */
+ AWARE_READ_IO, /* skip background GC if there is pending read IO */
+	AWARE_NONE,		/* do not consider pending IO for background GC */
+};
+
+enum device_allocation_policy {
+ ALLOCATE_FORWARD_NOHINT,
+ ALLOCATE_FORWARD_WITHIN_HINT,
+ ALLOCATE_FORWARD_FROM_HINT,
+};
+
/*
* An implementation of an rwsem that is explicitly unfair to readers. This
* prevents priority inversion when a low-priority reader acquires the read lock
@@ -172,6 +185,7 @@ struct f2fs_rwsem {
struct f2fs_mount_info {
unsigned int opt;
block_t root_reserved_blocks; /* root reserved blocks */
+ block_t root_reserved_nodes; /* root reserved nodes */
kuid_t s_resuid; /* reserved blocks for uid */
kgid_t s_resgid; /* reserved blocks for gid */
int active_logs; /* # of active logs */
@@ -212,6 +226,7 @@ struct f2fs_mount_info {
int compress_mode; /* compression mode */
unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
+ unsigned int lookup_mode;
};
#define F2FS_FEATURE_ENCRYPT 0x00000001
@@ -266,14 +281,36 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
+#define DEF_ENABLE_INTERVAL 16 /* 16 secs */
#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
+enum cp_time {
+ CP_TIME_START, /* begin */
+ CP_TIME_LOCK, /* after cp_global_sem */
+ CP_TIME_OP_LOCK, /* after block_operation */
+ CP_TIME_FLUSH_META, /* after flush sit/nat */
+ CP_TIME_SYNC_META, /* after sync_meta_pages */
+ CP_TIME_SYNC_CP_META, /* after sync cp meta pages */
+ CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */
+ CP_TIME_WAIT_CP_DATA, /* after wait on cp data */
+ CP_TIME_FLUSH_DEVICE, /* after flush device cache */
+ CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */
+ CP_TIME_END, /* after unblock_operation */
+ CP_TIME_MAX,
+};
+
+/* time cost stats of checkpoint */
+struct cp_stats {
+ ktime_t times[CP_TIME_MAX];
+};
+
struct cp_control {
int reason;
__u64 trim_start;
__u64 trim_end;
__u64 trim_minlen;
+ struct cp_stats stats;
};
/*
@@ -334,7 +371,10 @@ struct ckpt_req {
struct completion wait; /* completion for checkpoint done */
struct llist_node llnode; /* llist_node to be linked in wait queue */
int ret; /* return code of checkpoint */
- ktime_t queue_time; /* request queued time */
+ union {
+ ktime_t queue_time; /* request queued time */
+ ktime_t delta_time; /* time in queue */
+ };
};
struct ckpt_req_control {
@@ -350,6 +390,9 @@ struct ckpt_req_control {
unsigned int peak_time; /* peak wait time in msec until now */
};
+/* threshold above which a blocked checkpoint is reported, unit: ms */
+#define CP_LONG_LATENCY_THRESHOLD 5000
+
/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
@@ -1375,6 +1418,7 @@ enum {
DISCARD_TIME,
GC_TIME,
DISABLE_TIME,
+ ENABLE_TIME,
UMOUNT_DISCARD_TIMEOUT,
MAX_TIME,
};
@@ -1454,6 +1498,12 @@ enum {
TOTAL_CALL = FOREGROUND,
};
+enum f2fs_lookup_mode {
+ LOOKUP_PERF,
+ LOOKUP_COMPAT,
+ LOOKUP_AUTO,
+};
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1643,6 +1693,7 @@ struct f2fs_sb_info {
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
struct ckpt_req_control cprc_info; /* for checkpoint request control */
+ struct cp_stats cp_stats; /* for time stat of checkpoint */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -1810,6 +1861,9 @@ struct f2fs_sb_info {
spinlock_t dev_lock; /* protect dirty_device */
bool aligned_blksize; /* all devices has the same logical blksize */
unsigned int first_seq_zone_segno; /* first segno in sequential zone */
+	unsigned int bggc_io_aware;	/* how pending IO gates background GC */
+ unsigned int allocate_section_hint; /* the boundary position between devices */
+ unsigned int allocate_section_policy; /* determine the section writing priority */
/* For write statistics */
u64 sectors_written_start;
@@ -2362,13 +2416,11 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
return ofs == XATTR_NODE_OFFSET;
}
-static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
+static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi,
struct inode *inode, bool cap)
{
if (!inode)
return true;
- if (!test_opt(sbi, RESERVE_ROOT))
- return false;
if (IS_NOQUOTA(inode))
return true;
if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid()))
@@ -2389,7 +2441,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi,
avail_user_block_count = sbi->user_block_count -
sbi->current_reserved_blocks;
- if (!__allow_reserved_blocks(sbi, inode, cap))
+ if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap))
avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
@@ -2747,7 +2799,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
block_t valid_block_count;
- unsigned int valid_node_count;
+ unsigned int valid_node_count, avail_user_node_count;
unsigned int avail_user_block_count;
int err;
@@ -2769,15 +2821,20 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
valid_block_count = sbi->total_valid_block_count + 1;
- avail_user_block_count = get_available_block_count(sbi, inode, false);
+ avail_user_block_count = get_available_block_count(sbi, inode,
+ test_opt(sbi, RESERVE_NODE));
if (unlikely(valid_block_count > avail_user_block_count)) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
+ avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+ if (test_opt(sbi, RESERVE_NODE) &&
+ !__allow_reserved_root(sbi, inode, true))
+ avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes;
valid_node_count = sbi->total_valid_node_count + 1;
- if (unlikely(valid_node_count > sbi->total_node_count)) {
+ if (unlikely(valid_node_count > avail_user_node_count)) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
@@ -3004,13 +3061,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
if (sbi->gc_mode == GC_URGENT_HIGH)
return true;
- if (zoned_gc) {
- if (is_inflight_read_io(sbi))
- return false;
- } else {
- if (is_inflight_io(sbi, type))
- return false;
- }
+ if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi))
+ return false;
+ if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type))
+ return false;
if (sbi->gc_mode == GC_URGENT_MID)
return true;
@@ -3770,6 +3824,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname);
* node.c
*/
struct node_info;
+enum node_type;
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
@@ -3792,7 +3847,8 @@ int f2fs_remove_inode_page(struct inode *inode);
struct folio *f2fs_new_inode_folio(struct inode *inode);
struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
-struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid);
+struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
+ enum node_type node_type);
struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino);
struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid);
int f2fs_move_node_folio(struct folio *node_folio, int gc_type);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 42faaed6a02d..ffa045b39c01 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -35,15 +35,23 @@
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
-static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size)
+static void f2fs_zero_post_eof_page(struct inode *inode,
+ loff_t new_size, bool lock)
{
loff_t old_size = i_size_read(inode);
if (old_size >= new_size)
return;
+ if (mapping_empty(inode->i_mapping))
+ return;
+
+ if (lock)
+ filemap_invalidate_lock(inode->i_mapping);
/* zero or drop pages only in range of [old_size, new_size] */
- truncate_pagecache(inode, old_size);
+ truncate_inode_pages_range(inode->i_mapping, old_size, new_size);
+ if (lock)
+ filemap_invalidate_unlock(inode->i_mapping);
}
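Note: the new `lock` parameter of f2fs_zero_post_eof_page() is the usual conditional-locking pattern — take the invalidate lock only when the caller does not already hold it. A pthread-based sketch of the shape (names are illustrative):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t invalidate_lock = PTHREAD_MUTEX_INITIALIZER;

    /* mirrors the 'lock' parameter: take the lock only when the
     * caller does not already hold it */
    static void zero_post_eof(long old_size, long new_size, bool lock)
    {
            if (old_size >= new_size)
                    return;
            if (lock)
                    pthread_mutex_lock(&invalidate_lock);
            printf("zeroing [%ld, %ld]\n", old_size, new_size);
            if (lock)
                    pthread_mutex_unlock(&invalidate_lock);
    }

    int main(void)
    {
            zero_post_eof(100, 200, true);  /* standalone caller */

            pthread_mutex_lock(&invalidate_lock);   /* caller holds it */
            zero_post_eof(200, 300, false);
            pthread_mutex_unlock(&invalidate_lock);
            return 0;
    }
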
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
@@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT);
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true);
file_update_time(vmf->vma->vm_file);
filemap_invalidate_lock_shared(inode->i_mapping);
@@ -904,8 +910,16 @@ int f2fs_truncate(struct inode *inode)
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
- if (err)
+ if (err) {
+ /*
+ * Always truncate page #0 to avoid page cache
+ * leak in evict() path.
+ */
+ truncate_inode_pages_range(inode->i_mapping,
+ F2FS_BLK_TO_BYTES(0),
+ F2FS_BLK_END_BYTES(0));
return err;
+ }
}
err = f2fs_truncate_blocks(inode, i_size_read(inode), true);
@@ -1141,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
filemap_invalidate_lock(inode->i_mapping);
if (attr->ia_size > old_size)
- f2fs_zero_post_eof_page(inode, attr->ia_size);
+ f2fs_zero_post_eof_page(inode, attr->ia_size, false);
truncate_setsize(inode, attr->ia_size);
if (attr->ia_size <= old_size)
@@ -1260,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len, true);
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1547,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
+ f2fs_zero_post_eof_page(inode, offset + len, false);
f2fs_lock_op(sbi);
f2fs_drop_extent_tree(inode);
@@ -1670,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- filemap_invalidate_lock(mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
- filemap_invalidate_unlock(mapping);
+ f2fs_zero_post_eof_page(inode, offset + len, true);
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1806,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
+ f2fs_zero_post_eof_page(inode, offset + len, false);
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
@@ -1864,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
if (err)
return err;
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len, true);
f2fs_balance_fs(sbi, true);
@@ -4914,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from)
if (err)
return err;
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from));
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode,
+ iocb->ki_pos + iov_iter_count(from), true);
return count;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 098e9f71421e..a7708cf80c04 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1071,7 +1071,7 @@ next_step:
}
/* phase == 2 */
- node_folio = f2fs_get_node_folio(sbi, nid);
+ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
if (IS_ERR(node_folio))
continue;
@@ -1145,7 +1145,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
nid = le32_to_cpu(sum->nid);
ofs_in_node = le16_to_cpu(sum->ofs_in_node);
- node_folio = f2fs_get_node_folio(sbi, nid);
+ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
if (IS_ERR(node_folio))
return false;
@@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) {
+ f2fs_err(sbi, "%s: segment %u is used by log",
+ __func__, segno);
+ f2fs_bug_on(sbi, 1);
+ goto skip;
+ }
+
if (get_valid_blocks(sbi, segno, false) == 0)
goto freed;
if (gc_type == BG_GC && __is_large_section(sbi) &&
@@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
sum = folio_address(sum_folio);
if (type != GET_SUM_TYPE((&sum->footer))) {
- f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
+ f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA",
segno, type, GET_SUM_TYPE((&sum->footer)));
f2fs_stop_checkpoint(sbi, false,
STOP_CP_REASON_CORRUPTED_SUMMARY);
@@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
+ /*
+		 * avoid migrating an empty section, as it can be
+		 * allocated by a log in parallel.
+ */
+ if (!get_valid_blocks(sbi, segno, true))
+ continue;
+
if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno)))
continue;
@@ -2182,6 +2196,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
MAIN_SECS(sbi) += secs;
+ if (sbi->allocate_section_hint > MAIN_SECS(sbi))
+ sbi->allocate_section_hint = MAIN_SECS(sbi);
FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
@@ -2189,6 +2205,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
if (f2fs_is_multi_device(sbi)) {
int last_dev = sbi->s_ndevs - 1;
+ sbi->allocate_section_hint = FDEV(0).total_segments /
+ SEGS_PER_SEC(sbi);
+
FDEV(last_dev).total_segments =
(int)FDEV(last_dev).total_segments + segs;
FDEV(last_dev).end_blk =
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27743b93e186..482a362f2625 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab;
static struct kmem_cache *nat_entry_set_slab;
static struct kmem_cache *fsync_node_entry_slab;
+static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid;
+}
+
/*
* Check whether the given nid is within node id range.
*/
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
{
- if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) {
+ if (unlikely(is_invalid_nid(sbi, nid))) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
__func__, nid);
@@ -871,7 +876,8 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
if (!done) {
- nfolio[i] = f2fs_get_node_folio(sbi, nids[i]);
+ nfolio[i] = f2fs_get_node_folio(sbi, nids[i],
+ NODE_TYPE_NON_INODE);
if (IS_ERR(nfolio[i])) {
err = PTR_ERR(nfolio[i]);
f2fs_folio_put(nfolio[0], false);
@@ -989,7 +995,7 @@ static int truncate_dnode(struct dnode_of_data *dn)
return 1;
/* get direct node */
- folio = f2fs_get_node_folio(sbi, dn->nid);
+ folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE);
if (PTR_ERR(folio) == -ENOENT)
return 1;
else if (IS_ERR(folio))
@@ -1033,7 +1039,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
- folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid);
+ folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid,
+ NODE_TYPE_NON_INODE);
if (IS_ERR(folio)) {
trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio));
return PTR_ERR(folio);
@@ -1111,7 +1118,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
/* reference count'll be increased */
- folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]);
+ folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i],
+ NODE_TYPE_NON_INODE);
if (IS_ERR(folios[i])) {
err = PTR_ERR(folios[i]);
idx = i - 1;
@@ -1496,21 +1504,37 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
struct folio *folio, pgoff_t nid,
enum node_type ntype)
{
- if (unlikely(nid != nid_of_node(folio) ||
- (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) ||
- (ntype == NODE_TYPE_XATTR &&
- !f2fs_has_xattr_block(ofs_of_node(folio))) ||
- time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) {
- f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
- "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
- ntype, nid, nid_of_node(folio), ino_of_node(folio),
- ofs_of_node(folio), cpver_of_node(folio),
- next_blkaddr_of_node(folio));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
- return -EFSCORRUPTED;
+ if (unlikely(nid != nid_of_node(folio)))
+ goto out_err;
+
+ switch (ntype) {
+ case NODE_TYPE_INODE:
+ if (!IS_INODE(folio))
+ goto out_err;
+ break;
+ case NODE_TYPE_XATTR:
+ if (!f2fs_has_xattr_block(ofs_of_node(folio)))
+ goto out_err;
+ break;
+ case NODE_TYPE_NON_INODE:
+ if (IS_INODE(folio))
+ goto out_err;
+ break;
+ default:
+ break;
}
+ if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))
+ goto out_err;
return 0;
+out_err:
+ f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
+ "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ ntype, nid, nid_of_node(folio), ino_of_node(folio),
+ ofs_of_node(folio), cpver_of_node(folio),
+ next_blkaddr_of_node(folio));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ return -EFSCORRUPTED;
}
static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
@@ -1546,7 +1570,7 @@ repeat:
if (unlikely(!folio_test_uptodate(folio))) {
err = -EIO;
- goto out_err;
+ goto out_put_err;
}
if (!f2fs_inode_chksum_verify(sbi, folio)) {
@@ -1567,9 +1591,10 @@ out_put_err:
return ERR_PTR(err);
}
-struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid)
+struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
+ enum node_type node_type)
{
- return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR);
+ return __get_node_folio(sbi, nid, NULL, 0, node_type);
}
struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino)
@@ -2634,6 +2659,16 @@ retry:
f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
i = list_first_entry(&nm_i->free_nid_list,
struct free_nid, list);
+
+ if (unlikely(is_invalid_nid(sbi, i->nid))) {
+ spin_unlock(&nm_i->nid_list_lock);
+ f2fs_err(sbi, "Corrupted nid %u in free_nid_list",
+ i->nid);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_CORRUPTED_NID);
+ return false;
+ }
+
*nid = i->nid;
__move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 030390543b54..9cb8dcf8d417 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -57,6 +57,7 @@ enum node_type {
NODE_TYPE_REGULAR,
NODE_TYPE_INODE,
NODE_TYPE_XATTR,
+ NODE_TYPE_NON_INODE,
};
/*
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 4cb3a91801b4..215e442db72c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -548,7 +548,7 @@ got_it:
}
/* Get the node page */
- node_folio = f2fs_get_node_folio(sbi, nid);
+ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
if (IS_ERR(node_folio))
return PTR_ERR(node_folio);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index cc82d42ef14c..b45eace879d7 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2774,6 +2774,8 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
+ unsigned int alloc_policy = sbi->allocate_section_policy;
+ unsigned int alloc_hint = sbi->allocate_section_hint;
bool init = true;
int i;
int ret = 0;
@@ -2807,6 +2809,21 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
}
#endif
+ /*
+ * Prevent allocate_section_hint from exceeding MAIN_SECS()
+ * due to desynchronization.
+ */
+ if (alloc_policy != ALLOCATE_FORWARD_NOHINT &&
+ alloc_hint > MAIN_SECS(sbi))
+ alloc_hint = MAIN_SECS(sbi);
+
+ if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT &&
+ hint < alloc_hint)
+ hint = alloc_hint;
+ else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT &&
+ hint >= alloc_hint)
+ hint = 0;
+
find_other_zone:
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
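Note: before scanning the free-section bitmap, the hunk rewrites the search hint per allocate_section_policy — FROM_HINT pushes the search to or beyond the boundary section, WITHIN_HINT restarts from zero once past it, NOHINT leaves the hint alone. A standalone sketch of the hint adjustment (main_secs stands in for MAIN_SECS(sbi)):

    #include <stdio.h>

    enum { ALLOCATE_FORWARD_NOHINT, ALLOCATE_FORWARD_WITHIN_HINT,
           ALLOCATE_FORWARD_FROM_HINT };

    /* mirrors the hint rewrite in get_new_segment() */
    static unsigned int adjust_hint(unsigned int hint, unsigned int alloc_hint,
                                    int policy, unsigned int main_secs)
    {
            if (policy != ALLOCATE_FORWARD_NOHINT && alloc_hint > main_secs)
                    alloc_hint = main_secs;

            if (policy == ALLOCATE_FORWARD_FROM_HINT && hint < alloc_hint)
                    hint = alloc_hint;
            else if (policy == ALLOCATE_FORWARD_WITHIN_HINT &&
                     hint >= alloc_hint)
                    hint = 0;
            return hint;
    }

    int main(void)
    {
            /* device boundary at section 100 of 500 */
            printf("%u\n",
                   adjust_hint(40, 100, ALLOCATE_FORWARD_FROM_HINT, 500));   /* 100 */
            printf("%u\n",
                   adjust_hint(150, 100, ALLOCATE_FORWARD_WITHIN_HINT, 500)); /* 0 */
            return 0;
    }
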
@@ -3672,7 +3689,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
- f2fs_is_cow_file(inode))
+ f2fs_is_cow_file(inode) ||
+ is_inode_flag_set(inode, FI_NEED_IPU))
return CURSEG_HOT_DATA;
return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
@@ -3936,12 +3954,18 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
int seg_type = log_type_to_seg_type(type);
bool keep_order = (f2fs_lfs_mode(fio->sbi) &&
seg_type == CURSEG_COLD_DATA);
+ int err;
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
- if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr,
- &fio->new_blkaddr, sum, type, fio)) {
+ err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type, fio);
+ if (unlikely(err)) {
+ f2fs_err_ratelimited(fio->sbi,
+ "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d",
+ __func__, fio->ino, folio->index, type,
+ fio->old_blkaddr, fio->new_blkaddr, err);
if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
folio_end_writeback(folio);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5e2ee5c686b1..1ce2c8abaf48 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -600,6 +600,16 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
+static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi,
+ enum log_type type, unsigned int segno)
+{
+ if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
+ return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi,
+ (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
+ CURSEG_I(sbi, type)->next_blkoff;
+ return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true);
+}
+
static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
unsigned int node_blocks, unsigned int data_blocks,
unsigned int dent_blocks)
@@ -614,14 +624,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
if (unlikely(segno == NULL_SEGNO))
return false;
- if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
- CURSEG_I(sbi, i)->next_blkoff;
- } else {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- get_ckpt_valid_blocks(sbi, segno, true);
- }
+ left_blocks = get_left_section_blocks(sbi, i, segno);
blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks;
if (blocks > left_blocks)
@@ -634,14 +637,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
if (unlikely(segno == NULL_SEGNO))
return false;
- if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
- CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff;
- } else {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- get_ckpt_valid_blocks(sbi, segno, true);
- }
+ left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno);
if (dent_blocks > left_blocks)
return false;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 2619cbbd7d2d..fd8e7b0b2166 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -143,6 +143,7 @@ enum {
Opt_extent_cache,
Opt_data_flush,
Opt_reserve_root,
+ Opt_reserve_node,
Opt_resgid,
Opt_resuid,
Opt_mode,
@@ -181,6 +182,7 @@ enum {
Opt_nat_bits,
Opt_jqfmt,
Opt_checkpoint,
+ Opt_lookup_mode,
Opt_err,
};
@@ -244,6 +246,13 @@ static const struct constant_table f2fs_param_errors[] = {
{}
};
+static const struct constant_table f2fs_param_lookup_mode[] = {
+ {"perf", LOOKUP_PERF},
+ {"compat", LOOKUP_COMPAT},
+ {"auto", LOOKUP_AUTO},
+ {}
+};
+
static const struct fs_parameter_spec f2fs_param_specs[] = {
fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc),
fsparam_flag("disable_roll_forward", Opt_disable_roll_forward),
@@ -265,6 +274,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = {
fsparam_flag_no("extent_cache", Opt_extent_cache),
fsparam_flag("data_flush", Opt_data_flush),
fsparam_u32("reserve_root", Opt_reserve_root),
+ fsparam_u32("reserve_node", Opt_reserve_node),
fsparam_gid("resgid", Opt_resgid),
fsparam_uid("resuid", Opt_resuid),
fsparam_enum("mode", Opt_mode, f2fs_param_mode),
@@ -300,6 +310,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = {
fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode),
fsparam_flag("age_extent_cache", Opt_age_extent_cache),
fsparam_enum("errors", Opt_errors, f2fs_param_errors),
+ fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode),
{}
};
@@ -336,6 +347,8 @@ static match_table_t f2fs_checkpoint_tokens = {
#define F2FS_SPEC_discard_unit (1 << 21)
#define F2FS_SPEC_memory_mode (1 << 22)
#define F2FS_SPEC_errors (1 << 23)
+#define F2FS_SPEC_lookup_mode (1 << 24)
+#define F2FS_SPEC_reserve_node (1 << 25)
struct f2fs_fs_context {
struct f2fs_mount_info info;
@@ -437,22 +450,30 @@ static void f2fs_destroy_casefold_cache(void) { }
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- block_t limit = min((sbi->user_block_count >> 3),
+ block_t block_limit = min((sbi->user_block_count >> 3),
sbi->user_block_count - sbi->reserved_blocks);
+ block_t node_limit = sbi->total_node_count >> 3;
/* limit is 12.5% */
if (test_opt(sbi, RESERVE_ROOT) &&
- F2FS_OPTION(sbi).root_reserved_blocks > limit) {
- F2FS_OPTION(sbi).root_reserved_blocks = limit;
+ F2FS_OPTION(sbi).root_reserved_blocks > block_limit) {
+ F2FS_OPTION(sbi).root_reserved_blocks = block_limit;
f2fs_info(sbi, "Reduce reserved blocks for root = %u",
F2FS_OPTION(sbi).root_reserved_blocks);
}
- if (!test_opt(sbi, RESERVE_ROOT) &&
+ if (test_opt(sbi, RESERVE_NODE) &&
+ F2FS_OPTION(sbi).root_reserved_nodes > node_limit) {
+ F2FS_OPTION(sbi).root_reserved_nodes = node_limit;
+ f2fs_info(sbi, "Reduce reserved nodes for root = %u",
+ F2FS_OPTION(sbi).root_reserved_nodes);
+ }
+ if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) &&
(!uid_eq(F2FS_OPTION(sbi).s_resuid,
make_kuid(&init_user_ns, F2FS_DEF_RESUID)) ||
!gid_eq(F2FS_OPTION(sbi).s_resgid,
make_kgid(&init_user_ns, F2FS_DEF_RESGID))))
- f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root",
+ f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root"
+ " and reserve_node",
from_kuid_munged(&init_user_ns,
F2FS_OPTION(sbi).s_resuid),
from_kgid_munged(&init_user_ns,
@@ -847,6 +868,11 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32;
ctx->spec_mask |= F2FS_SPEC_reserve_root;
break;
+ case Opt_reserve_node:
+ ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE);
+ F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_reserve_node;
+ break;
case Opt_resuid:
F2FS_CTX_INFO(ctx).s_resuid = result.uid;
ctx->spec_mask |= F2FS_SPEC_resuid;
@@ -994,6 +1020,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_enable:
+ F2FS_CTX_INFO(ctx).unusable_cap_perc = 0;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc;
+ F2FS_CTX_INFO(ctx).unusable_cap = 0;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap;
ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
default:
@@ -1149,6 +1179,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_nat_bits:
ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS);
break;
+ case Opt_lookup_mode:
+ F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_lookup_mode;
+ break;
}
return 0;
}
@@ -1191,7 +1225,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc,
goto err_jquota_change;
if (old_qname) {
- if (strcmp(old_qname, new_qname) == 0) {
+ if (!new_qname) {
+ f2fs_info(sbi, "remove qf_name %s",
+ old_qname);
+ continue;
+ } else if (strcmp(old_qname, new_qname) == 0) {
ctx->qname_mask &= ~(1 << i);
continue;
}
@@ -1430,6 +1468,14 @@ static int f2fs_check_opt_consistency(struct fs_context *fc,
ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT;
}
+ if (test_opt(sbi, RESERVE_NODE) &&
+ (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) &&
+ ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) {
+ f2fs_info(sbi, "Preserve previous reserve_node=%u",
+ F2FS_OPTION(sbi).root_reserved_nodes);
+ ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE);
+ ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE;
+ }
err = f2fs_check_test_dummy_encryption(fc, sb);
if (err)
@@ -1629,6 +1675,9 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb)
if (ctx->spec_mask & F2FS_SPEC_reserve_root)
F2FS_OPTION(sbi).root_reserved_blocks =
F2FS_CTX_INFO(ctx).root_reserved_blocks;
+ if (ctx->spec_mask & F2FS_SPEC_reserve_node)
+ F2FS_OPTION(sbi).root_reserved_nodes =
+ F2FS_CTX_INFO(ctx).root_reserved_nodes;
if (ctx->spec_mask & F2FS_SPEC_resgid)
F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid;
if (ctx->spec_mask & F2FS_SPEC_resuid)
@@ -1658,6 +1707,8 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb)
F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode;
if (ctx->spec_mask & F2FS_SPEC_errors)
F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors;
+ if (ctx->spec_mask & F2FS_SPEC_lookup_mode)
+ F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode;
f2fs_apply_compression(fc, sb);
f2fs_apply_test_dummy_encryption(fc, sb);
@@ -2349,9 +2400,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
seq_puts(seq, "fragment:block");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
- if (test_opt(sbi, RESERVE_ROOT))
- seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u",
+ if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE))
+ seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u,"
+ "resgid=%u",
F2FS_OPTION(sbi).root_reserved_blocks,
+ F2FS_OPTION(sbi).root_reserved_nodes,
from_kuid_munged(&init_user_ns,
F2FS_OPTION(sbi).s_resuid),
from_kgid_munged(&init_user_ns,
@@ -2422,6 +2475,13 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, NAT_BITS))
seq_puts(seq, ",nat_bits");
+ if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF)
+ seq_show_option(seq, "lookup_mode", "perf");
+ else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT)
+ seq_show_option(seq, "lookup_mode", "compat");
+ else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO)
+ seq_show_option(seq, "lookup_mode", "auto");
+
return 0;
}
@@ -2486,6 +2546,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
#endif
f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL);
+
+ F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF;
}
#ifdef CONFIG_QUOTA
@@ -2566,21 +2628,39 @@ out_unlock:
restore_flag:
sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
+ f2fs_info(sbi, "f2fs_disable_checkpoint() finish, err:%d", err);
return err;
}
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
- int retry = DEFAULT_RETRY_IO_COUNT;
+ unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16;
+ long long start, writeback, end;
+
+ f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld",
+ get_pages(sbi, F2FS_DIRTY_META),
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DATA));
+
+ f2fs_update_time(sbi, ENABLE_TIME);
+
+ start = ktime_get();
/* we should flush all the data to keep data consistency */
- do {
- sync_inodes_sb(sbi->sb);
+ while (get_pages(sbi, F2FS_DIRTY_DATA)) {
+ writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
- } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
- if (unlikely(retry < 0))
- f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
+ if (f2fs_time_over(sbi, ENABLE_TIME))
+ break;
+ }
+ writeback = ktime_get();
+
+ sync_inodes_sb(sbi->sb);
+
+ if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA)))
+ f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld",
+ get_pages(sbi, F2FS_DIRTY_DATA));
f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
@@ -2593,6 +2673,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* Let's ensure there's no pending checkpoint anymore */
f2fs_flush_ckpt_thread(sbi);
+
+ end = ktime_get();
+
+ f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu",
+ ktime_ms_delta(writeback, start),
+ ktime_ms_delta(end, writeback));
}
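Note: the rewritten f2fs_enable_checkpoint() swaps a fixed retry count for a time budget — write back dirty data in slices until clean or ENABLE_TIME (16 s) elapses, then make one final synchronous pass. A scaled-down userspace sketch of that loop shape; the page counts, slice size, and sleeps are simulated:

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static long long now_ms(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
    }

    int main(void)
    {
            long long deadline = now_ms() + 160; /* scaled-down ENABLE_TIME */
            int dirty_pages = 100;

            /* write back in slices until clean or the budget is spent */
            while (dirty_pages > 0) {
                    dirty_pages -= 7;       /* simulated partial writeback */
                    usleep(10000);
                    if (now_ms() > deadline)
                            break;
            }
            dirty_pages = 0;        /* final synchronous pass, as
                                     * sync_inodes_sb() in the hunk */
            printf("all data written back\n");
            return 0;
    }
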
static int __f2fs_remount(struct fs_context *fc, struct super_block *sb)
@@ -4156,6 +4242,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->total_node_count = SEGS_TO_BLKS(sbi,
((le32_to_cpu(raw_super->segment_count_nat) / 2) *
NAT_ENTRY_PER_BLOCK));
+ sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count);
+ sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT;
F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
@@ -4179,6 +4267,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+ sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL;
sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
DEF_UMOUNT_DISCARD_TIMEOUT;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -4637,9 +4726,11 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
sbi->aligned_blksize = true;
+ sbi->bggc_io_aware = AWARE_ALL_IO;
#ifdef CONFIG_BLK_DEV_ZONED
sbi->max_open_zones = UINT_MAX;
sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ;
+ sbi->bggc_io_aware = AWARE_READ_IO;
#endif
for (i = 0; i < max_devices; i++) {
@@ -4667,6 +4758,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
SEGS_TO_BLKS(sbi,
FDEV(i).total_segments) - 1 +
le32_to_cpu(raw_super->segment0_blkaddr);
+ sbi->allocate_section_hint = FDEV(i).total_segments /
+ SEGS_PER_SEC(sbi);
} else {
FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
FDEV(i).end_blk = FDEV(i).start_blk +
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index f736052dea50..6d2a4fba68a2 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -281,6 +281,22 @@ static ssize_t encoding_flags_show(struct f2fs_attr *a,
le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags));
}
+static ssize_t effective_lookup_mode_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ switch (F2FS_OPTION(sbi).lookup_mode) {
+ case LOOKUP_PERF:
+ return sysfs_emit(buf, "perf\n");
+ case LOOKUP_COMPAT:
+ return sysfs_emit(buf, "compat\n");
+ case LOOKUP_AUTO:
+ if (sb_no_casefold_compat_fallback(sbi->sb))
+ return sysfs_emit(buf, "auto:perf\n");
+ return sysfs_emit(buf, "auto:compat\n");
+ }
+ return 0;
+}
+
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -866,6 +882,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "bggc_io_aware")) {
+ if (t < AWARE_ALL_IO || t > AWARE_NONE)
+ return -EINVAL;
+ sbi->bggc_io_aware = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "allocate_section_hint")) {
+ if (t < 0 || t > MAIN_SECS(sbi))
+ return -EINVAL;
+ sbi->allocate_section_hint = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "allocate_section_policy")) {
+ if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT)
+ return -EINVAL;
+ sbi->allocate_section_policy = t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -1138,6 +1175,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_victim_search);
F2FS_SBI_GENERAL_RW_ATTR(migration_granularity);
F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity);
F2FS_SBI_GENERAL_RW_ATTR(dir_level);
+F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint);
+F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_SBI_GENERAL_RW_ATTR(iostat_enable);
F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms);
@@ -1175,6 +1214,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
F2FS_SBI_GENERAL_RW_ATTR(carve_out);
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
+F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1211,6 +1251,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(encoding_flags);
+F2FS_GENERAL_RO_ATTR(effective_lookup_mode);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
@@ -1303,6 +1344,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
+ ATTR_LIST(bggc_io_aware),
#ifdef CONFIG_F2FS_IOSTAT
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
@@ -1329,6 +1371,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
ATTR_LIST(encoding_flags),
+ ATTR_LIST(effective_lookup_mode),
ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
@@ -1371,6 +1414,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_read_extent_count),
ATTR_LIST(carve_out),
ATTR_LIST(reserved_pin_section),
+ ATTR_LIST(allocate_section_hint),
+ ATTR_LIST(allocate_section_policy),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1723,12 +1768,15 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
seq_printf(seq, " Main : 0x%010x (%10d)\n",
SM_I(sbi)->main_blkaddr,
le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main));
- seq_printf(seq, " # of Sections : %12d\n",
- le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
+ seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10);
+ seq_printf(seq, " Segment size : %12d MB\n",
+ (BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10);
seq_printf(seq, " Segs/Sections : %12d\n",
SEGS_PER_SEC(sbi));
seq_printf(seq, " Section size : %12d MB\n",
- SEGS_PER_SEC(sbi) << 1);
+ (BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10);
+ seq_printf(seq, " # of Sections : %12d\n",
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
if (!f2fs_is_multi_device(sbi))
return 0;
@@ -1742,6 +1790,69 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused donation_list_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ struct dentry *dentry;
+ char *buf, *path;
+ int i;
+
+ buf = f2fs_getname(sbi);
+ if (!buf)
+ return 0;
+
+ seq_printf(seq, "Donation List\n");
+ seq_printf(seq, " # of files : %u\n", sbi->donate_files);
+ seq_printf(seq, " %-50s %10s %20s %20s %22s\n",
+ "File path", "Status", "Donation offset (kb)",
+ "Donation size (kb)", "File cached size (kb)");
+ seq_printf(seq, "---\n");
+
+ for (i = 0; i < sbi->donate_files; i++) {
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ if (list_empty(&sbi->inode_list[DONATE_INODE])) {
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+ break;
+ }
+ fi = list_first_entry(&sbi->inode_list[DONATE_INODE],
+ struct f2fs_inode_info, gdonate_list);
+ list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+
+ if (!inode)
+ continue;
+
+ inode_lock_shared(inode);
+
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ path = NULL;
+ } else {
+ path = dentry_path_raw(dentry, buf, PATH_MAX);
+ if (IS_ERR(path))
+ goto next;
+ }
+ seq_printf(seq, " %-50s %10s %20llu %20llu %22llu\n",
+ path ? path : "<unlinked>",
+ is_inode_flag_set(inode, FI_DONATE_FINISHED) ?
+ "Evicted" : "Donated",
+ (loff_t)fi->donate_start << (PAGE_SHIFT - 10),
+ (loff_t)(fi->donate_end + 1 - fi->donate_start) << (PAGE_SHIFT - 10),
+ (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10));
+next:
+ dput(dentry);
+ inode_unlock_shared(inode);
+ iput(inode);
+ }
+ f2fs_putname(buf);
+ return 0;
+}
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
static int __maybe_unused inject_stats_seq_show(struct seq_file *seq,
void *offset)
@@ -1851,6 +1962,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
discard_plist_seq_show, sb);
proc_create_single_data("disk_map", 0444, sbi->s_proc,
disk_map_seq_show, sb);
+ proc_create_single_data("donation_list", 0444, sbi->s_proc,
+ donation_list_seq_show, sb);
#ifdef CONFIG_F2FS_FAULT_INJECTION
proc_create_single_data("inject_stats", 0444, sbi->s_proc,
inject_stats_seq_show, sb);
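
donation_list_seq_show() in the sysfs.c hunk walks a list that can change concurrently, so it never holds the spinlock across the loop body: each pass rotates the first entry to the tail under the lock, pins the inode with igrab(), and only then drops the lock. A generic sketch of that rotate-and-pin pattern (struct names are illustrative, not the f2fs ones):

	for (i = 0; i < nr_entries; i++) {
		spin_lock(&lock);
		if (list_empty(&head)) {
			spin_unlock(&lock);
			break;
		}
		e = list_first_entry(&head, struct my_ent, list);
		list_move_tail(&e->list, &head);	/* next pass sees the next entry */
		inode = igrab(&e->vfs_inode);		/* NULL if inode is being evicted */
		spin_unlock(&lock);

		if (!inode)
			continue;
		/* may sleep here: the inode is pinned and the lock is dropped */
		iput(inode);
	}

Bounding the loop by the entry count keeps it finite even while the list keeps rotating.
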
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index acbec5bdd521..92b091783966 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1209,7 +1209,7 @@ EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
int *nr_cluster, struct msdos_dir_entry **de,
- struct buffer_head **bh, loff_t *i_pos)
+ struct buffer_head **bh)
{
struct super_block *sb = dir->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -1269,7 +1269,6 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
get_bh(bhs[n]);
*bh = bhs[n];
*de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
- *i_pos = fat_make_i_pos(sb, *bh, *de);
/* Second stage: clear the rest of cluster, and write outs */
err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE);
@@ -1298,7 +1297,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
struct msdos_dir_entry *de;
int err, free_slots, i, nr_bhs;
- loff_t pos, i_pos;
+ loff_t pos;
sinfo->nr_slots = nr_slots;
@@ -1386,7 +1385,7 @@ found:
* add the cluster to dir.
*/
cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster,
- &de, &bh, &i_pos);
+ &de, &bh);
if (cluster < 0) {
err = cluster;
goto error_remove;
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 666e61753aed..93b7ebf8d927 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -161,25 +161,24 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
EXPORT_SYMBOL(vfs_parse_fs_param);
/**
- * vfs_parse_fs_string - Convenience function to just parse a string.
+ * vfs_parse_fs_qstr - Convenience function to just parse a string.
* @fc: Filesystem context.
* @key: Parameter name.
* @value: Default value.
- * @v_size: Maximum number of bytes in the value.
*/
-int vfs_parse_fs_string(struct fs_context *fc, const char *key,
- const char *value, size_t v_size)
+int vfs_parse_fs_qstr(struct fs_context *fc, const char *key,
+ const struct qstr *value)
{
int ret;
struct fs_parameter param = {
.key = key,
.type = fs_value_is_flag,
- .size = v_size,
+ .size = value ? value->len : 0,
};
if (value) {
- param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+ param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL);
if (!param.string)
return -ENOMEM;
param.type = fs_value_is_string;
@@ -189,7 +188,7 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key,
kfree(param.string);
return ret;
}
-EXPORT_SYMBOL(vfs_parse_fs_string);
+EXPORT_SYMBOL(vfs_parse_fs_qstr);
/**
* vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data
@@ -218,16 +217,14 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
while ((key = sep(&options)) != NULL) {
if (*key) {
- size_t v_len = 0;
char *value = strchr(key, '=');
if (value) {
if (unlikely(value == key))
continue;
*value++ = 0;
- v_len = strlen(value);
}
- ret = vfs_parse_fs_string(fc, key, value, v_len);
+ ret = vfs_parse_fs_string(fc, key, value);
if (ret < 0)
break;
}
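
With vfs_parse_fs_qstr() the value's length travels inside the struct qstr, so callers stop passing a separate byte count; vfs_parse_fs_string() evidently survives as a wrapper for plain C strings, as the monolithic parser above still calls it. A hedged sketch of a qstr caller, using the QSTR() initializer for NUL-terminated strings:

	struct qstr v = QSTR("ordered");		/* .name and .len in one object */
	int err = vfs_parse_fs_qstr(fc, "data", &v);

	if (!err)					/* flag-style parameters pass NULL */
		err = vfs_parse_fs_qstr(fc, "ro", NULL);
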
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index a774166264de..3a4ae632c94a 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -13,7 +13,7 @@ config FUSE_FS
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
- See <file:Documentation/filesystems/fuse.rst> for more information.
+ See <file:Documentation/filesystems/fuse/fuse.rst> for more information.
See <file:Documentation/Changes> for needed library/utility version.
If you want to develop a userspace FS, or if you want to use
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3f0f312a31c1..22ad9538dfc4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -10,10 +10,11 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y := trace.o # keep trace.o first so tracepoint build errors surface early
+fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
-fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o
fuse-$(CONFIG_SYSCTL) += sysctl.o
fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
new file mode 100644
index 000000000000..4afda419dd14
--- /dev/null
+++ b/fs/fuse/backing.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+ if (fb && refcount_inc_not_zero(&fb->count))
+ return fb;
+ return NULL;
+}
+
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+ pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+ if (fb->file)
+ fput(fb->file);
+ put_cred(fb->cred);
+ kfree_rcu(fb, rcu);
+}
+
+void fuse_backing_put(struct fuse_backing *fb)
+{
+ if (fb && refcount_dec_and_test(&fb->count))
+ fuse_backing_free(fb);
+}
+
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+ idr_init(&fc->backing_files_map);
+}
+
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&fc->lock);
+ /* FIXME: xarray might be space inefficient */
+ id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+ spin_unlock(&fc->lock);
+ idr_preload_end();
+
+ WARN_ON_ONCE(id == 0);
+ return id;
+}
+
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+ int id)
+{
+ struct fuse_backing *fb;
+
+ spin_lock(&fc->lock);
+ fb = idr_remove(&fc->backing_files_map, id);
+ spin_unlock(&fc->lock);
+
+ return fb;
+}
+
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+ struct fuse_backing *fb = p;
+
+ WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+ fuse_backing_free(fb);
+ return 0;
+}
+
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+ idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+ idr_destroy(&fc->backing_files_map);
+}
+
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+ struct file *file;
+ struct super_block *backing_sb;
+ struct fuse_backing *fb = NULL;
+ int res;
+
+ pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ res = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ res = -EINVAL;
+ if (map->flags || map->padding)
+ goto out;
+
+ file = fget_raw(map->fd);
+ res = -EBADF;
+ if (!file)
+ goto out;
+
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
+ backing_sb = file_inode(file)->i_sb;
+ res = -ELOOP;
+ if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+ goto out_fput;
+
+ fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+ res = -ENOMEM;
+ if (!fb)
+ goto out_fput;
+
+ fb->file = file;
+ fb->cred = prepare_creds();
+ refcount_set(&fb->count, 1);
+
+ res = fuse_backing_id_alloc(fc, fb);
+ if (res < 0) {
+ fuse_backing_free(fb);
+ fb = NULL;
+ }
+
+out:
+ pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+ return res;
+
+out_fput:
+ fput(file);
+ goto out;
+}
+
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb = NULL;
+ int err;
+
+ pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ err = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ err = -EINVAL;
+ if (backing_id <= 0)
+ goto out;
+
+ err = -ENOENT;
+ fb = fuse_backing_id_remove(fc, backing_id);
+ if (!fb)
+ goto out;
+
+ fuse_backing_put(fb);
+ err = 0;
+out:
+ pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+ return err;
+}
+
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb;
+
+ rcu_read_lock();
+ fb = idr_find(&fc->backing_files_map, backing_id);
+ fb = fuse_backing_get(fb);
+ rcu_read_unlock();
+
+ return fb;
+}
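
fuse_backing_lookup() at the end of backing.c searches the IDR with no lock at all; that is safe only because removal frees entries through kfree_rcu() and the lookup takes its reference with refcount_inc_not_zero(). Reduced to the bare pattern:

	/* reader: the object may be removed and freed concurrently */
	rcu_read_lock();
	obj = idr_find(&map, id);		/* RCU keeps the memory valid */
	if (obj && !refcount_inc_not_zero(&obj->count))
		obj = NULL;			/* lost the race: already dying */
	rcu_read_unlock();

	/* writer: unpublish first, free only after a grace period */
	spin_lock(&lock);
	obj = idr_remove(&map, id);
	spin_unlock(&lock);
	if (obj && refcount_dec_and_test(&obj->count))
		kfree_rcu(obj, rcu);
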
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b39844d75a80..28c96961e85d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#define CUSE_CONNTBL_LEN 64
@@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
*/
static int cuse_channel_release(struct inode *inode, struct file *file)
{
- struct fuse_dev *fud = file->private_data;
+ struct fuse_dev *fud = __fuse_get_dev(file);
struct cuse_conn *cc = fc_to_cc(fud->fc);
/* remove from the conntbl, no more access from this point on */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ad8645c0f9fe..132f38619d70 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,7 +25,6 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
-#define CREATE_TRACE_POINTS
#include "fuse_trace.h"
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
@@ -207,8 +206,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap,
if (fuse_block_alloc(fc, for_background)) {
err = -EINTR;
- if (wait_event_killable_exclusive(fc->blocked_waitq,
- !fuse_block_alloc(fc, for_background)))
+ if (wait_event_state_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background),
+ (TASK_KILLABLE | TASK_FREEZABLE)))
goto out;
}
/* Matches smp_wmb() in fuse_set_initialized() */
@@ -322,6 +322,7 @@ unsigned int fuse_req_hash(u64 unique)
{
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
+EXPORT_SYMBOL_GPL(fuse_req_hash);
/*
* A new request is available, wake fiq->waitq
@@ -369,12 +370,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
}
}
+static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+{
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique_locked(fiq);
+
+ /* tracepoint captures in.h.unique and in.h.len */
+ trace_fuse_request_send(req);
+}
+
+inline void fuse_request_assign_unique(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+{
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
+ /* tracepoint captures in.h.unique and in.h.len */
+ trace_fuse_request_send(req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_assign_unique);
+
static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->lock);
if (fiq->connected) {
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique_locked(fiq);
+ fuse_request_assign_unique_locked(fiq, req);
list_add_tail(&req->list, &fiq->pending);
fuse_dev_wake_and_unlock(fiq);
} else {
@@ -397,7 +418,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
- trace_fuse_request_send(req);
fiq->ops->send_req(fiq, req);
}
@@ -687,10 +707,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
{
struct fuse_iqueue *fiq = &fc->iq;
- req->in.h.unique = fuse_get_unique(fiq);
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
+ fuse_request_assign_unique(fiq, req);
return fuse_uring_queue_bq_req(req);
}
@@ -1528,14 +1548,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file)
return 0;
}
+struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ struct fuse_dev *fud = __fuse_get_dev(file);
+ int err;
+
+ if (likely(fud))
+ return fud;
+
+ err = wait_event_interruptible(fuse_dev_waitq,
+ READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT);
+ if (err)
+ return ERR_PTR(err);
+
+ fud = __fuse_get_dev(file);
+ if (!fud)
+ return ERR_PTR(-EPERM);
+
+ return fud;
+}
+
static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
{
struct fuse_copy_state cs;
struct file *file = iocb->ki_filp;
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!user_backed_iter(to))
return -EINVAL;
@@ -1555,8 +1595,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
struct fuse_copy_state cs;
struct fuse_dev *fud = fuse_get_dev(in);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
GFP_KERNEL);
@@ -1600,35 +1640,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_poll_wakeup_out outarg;
- int err = -EINVAL;
+ int err;
if (size != sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
fuse_copy_finish(cs);
return fuse_notify_poll_wakeup(fc, &outarg);
-
-err:
- fuse_copy_finish(cs);
- return err;
}
static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_inval_inode_out outarg;
- int err = -EINVAL;
+ int err;
if (size != sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
fuse_copy_finish(cs);
down_read(&fc->killsb);
@@ -1636,10 +1672,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
-
-err:
- fuse_copy_finish(cs);
- return err;
}
static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
@@ -1647,29 +1679,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
{
struct fuse_notify_inval_entry_out outarg;
int err;
- char *buf = NULL;
+ char *buf;
struct qstr name;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
- err = -ENAMETOOLONG;
if (outarg.namelen > fc->name_max)
- goto err;
+ return -ENAMETOOLONG;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
- goto err;
+ return -EINVAL;
- err = -ENOMEM;
buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
if (!buf)
- goto err;
+ return -ENOMEM;
name.name = buf;
name.len = outarg.namelen;
@@ -1682,12 +1711,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
- kfree(buf);
- return err;
-
err:
kfree(buf);
- fuse_copy_finish(cs);
return err;
}
@@ -1696,29 +1721,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
{
struct fuse_notify_delete_out outarg;
int err;
- char *buf = NULL;
+ char *buf;
struct qstr name;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
- err = -ENAMETOOLONG;
if (outarg.namelen > fc->name_max)
- goto err;
+ return -ENAMETOOLONG;
- err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
- goto err;
+ return -EINVAL;
- err = -ENOMEM;
buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
if (!buf)
- goto err;
+ return -ENOMEM;
name.name = buf;
name.len = outarg.namelen;
@@ -1731,12 +1752,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
- kfree(buf);
- return err;
-
err:
kfree(buf);
- fuse_copy_finish(cs);
return err;
}
@@ -1754,17 +1771,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
loff_t file_size;
loff_t end;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto out_finish;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto out_finish;
+ return err;
- err = -EINVAL;
if (size - sizeof(outarg) != outarg.size)
- goto out_finish;
+ return -EINVAL;
nodeid = outarg.nodeid;
@@ -1824,8 +1839,6 @@ out_iput:
iput(inode);
out_up_killsb:
up_read(&fc->killsb);
-out_finish:
- fuse_copy_finish(cs);
return err;
}
@@ -1940,13 +1953,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
u64 nodeid;
int err;
- err = -EINVAL;
if (size != sizeof(outarg))
- goto copy_finish;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto copy_finish;
+ return err;
fuse_copy_finish(cs);
@@ -1962,10 +1974,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
up_read(&fc->killsb);
return err;
-
-copy_finish:
- fuse_copy_finish(cs);
- return err;
}
/*
@@ -2044,6 +2052,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc)
return 0;
}
+static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_prune_out outarg;
+ const unsigned int batch = 512;
+ u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL);
+ unsigned int num, i;
+ int err;
+
+ if (!nodeids)
+ return -ENOMEM;
+
+ if (size < sizeof(outarg))
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ return err;
+
+ if (size - sizeof(outarg) != outarg.count * sizeof(u64))
+ return -EINVAL;
+
+ for (; outarg.count; outarg.count -= num) {
+ num = min(batch, outarg.count);
+ err = fuse_copy_one(cs, nodeids, num * sizeof(u64));
+ if (err)
+ return err;
+
+ scoped_guard(rwsem_read, &fc->killsb) {
+ for (i = 0; i < num; i++)
+ fuse_try_prune_one_inode(fc, nodeids[i]);
+ }
+ }
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
@@ -2075,8 +2119,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_INC_EPOCH:
return fuse_notify_inc_epoch(fc);
+ case FUSE_NOTIFY_PRUNE:
+ return fuse_notify_prune(fc, size, cs);
+
default:
- fuse_copy_finish(cs);
return -EINVAL;
}
}
@@ -2156,7 +2202,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
*/
if (!oh.unique) {
err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
- goto out;
+ goto copy_finish;
}
err = -EINVAL;
@@ -2229,7 +2275,7 @@ copy_finish:
static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
{
struct fuse_copy_state cs;
- struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+ struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp);
if (!fud)
return -EPERM;
@@ -2251,11 +2297,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
unsigned idx;
struct pipe_buffer *bufs;
struct fuse_copy_state cs;
- struct fuse_dev *fud;
+ struct fuse_dev *fud = __fuse_get_dev(out);
size_t rem;
ssize_t ret;
- fud = fuse_get_dev(out);
if (!fud)
return -EPERM;
@@ -2341,7 +2386,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
struct fuse_iqueue *fiq;
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
+ if (IS_ERR(fud))
return EPOLLERR;
fiq = &fud->fc->iq;
@@ -2394,7 +2439,7 @@ static void end_polls(struct fuse_conn *fc)
* The same effect is usually achievable through killing the filesystem daemon
* and all users of the filesystem. The exception is the combination of an
* asynchronous request and the tricky deadlock (see
- * Documentation/filesystems/fuse.rst).
+ * Documentation/filesystems/fuse/fuse.rst).
*
* Aborting requests under I/O goes as follows: 1: Separate out unlocked
* requests, they should be finished off immediately. Locked requests will be
@@ -2488,7 +2533,7 @@ void fuse_wait_aborted(struct fuse_conn *fc)
int fuse_dev_release(struct inode *inode, struct file *file)
{
- struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_dev *fud = __fuse_get_dev(file);
if (fud) {
struct fuse_conn *fc = fud->fc;
@@ -2519,8 +2564,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
{
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
/* No locking - fasync_helper does its own locking */
return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
@@ -2530,7 +2575,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
{
struct fuse_dev *fud;
- if (new->private_data)
+ if (__fuse_get_dev(new))
return -EINVAL;
fud = fuse_dev_alloc_install(fc);
@@ -2561,7 +2606,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
* uses the same ioctl handler.
*/
if (fd_file(f)->f_op == file->f_op)
- fud = fuse_get_dev(fd_file(f));
+ fud = __fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
@@ -2579,8 +2624,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file,
struct fuse_dev *fud = fuse_get_dev(file);
struct fuse_backing_map map;
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
return -EOPNOTSUPP;
@@ -2596,8 +2641,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
struct fuse_dev *fud = fuse_get_dev(file);
int backing_id;
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
return -EOPNOTSUPP;
@@ -2608,6 +2653,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
return fuse_backing_close(fud->fc, backing_id);
}
+static long fuse_dev_ioctl_sync_init(struct file *file)
+{
+ int err = -EINVAL;
+
+ mutex_lock(&fuse_mutex);
+ if (!__fuse_get_dev(file)) {
+ WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT);
+ err = 0;
+ }
+ mutex_unlock(&fuse_mutex);
+ return err;
+}
+
static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -2623,6 +2681,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
case FUSE_DEV_IOC_BACKING_CLOSE:
return fuse_dev_ioctl_backing_close(file, argp);
+ case FUSE_DEV_IOC_SYNC_INIT:
+ return fuse_dev_ioctl_sync_init(file);
+
default:
return -ENOTTY;
}
@@ -2631,7 +2692,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
#ifdef CONFIG_PROC_FS
static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file)
{
- struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_dev *fud = __fuse_get_dev(file);
if (!fud)
return;
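
Two patterns are worth calling out in the dev.c hunks. First, the notify handlers stop calling fuse_copy_finish() on their error paths because fuse_dev_do_write() now jumps to its own copy_finish label after fuse_notify() returns, covering every exit uniformly. Second, fuse_notify_prune() uses scope-based cleanup; roughly (copy_chunk() and prune_one() are stand-ins for fuse_copy_one() and fuse_try_prune_one_inode()):

	u64 *ids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL);

	if (!ids)
		return -ENOMEM;

	while (remaining) {
		unsigned int num = min(batch, remaining);

		if (copy_chunk(ids, num))
			return -EFAULT;		/* ids is still freed by __free() */

		scoped_guard(rwsem_read, &killsb) {
			for (i = 0; i < num; i++)
				prune_one(ids[i]);
		}				/* rwsem dropped here, once per batch */
		remaining -= num;
	}
	return 0;

The point is that every early return frees the buffer and can never leak the rwsem.
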
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index a30c44234a4e..f6b12aebb8bb 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -7,6 +7,7 @@
#include "fuse_i.h"
#include "dev_uring_i.h"
#include "fuse_dev_i.h"
+#include "fuse_trace.h"
#include <linux/fs.h>
#include <linux/io_uring/cmd.h>
@@ -1139,9 +1140,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
return -EINVAL;
fud = fuse_get_dev(cmd->file);
- if (!fud) {
+ if (IS_ERR(fud)) {
pr_info_ratelimited("No fuse device found\n");
- return -ENOTCONN;
+ return PTR_ERR(fud);
}
fc = fud->fc;
@@ -1268,8 +1269,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
if (!queue)
goto err;
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique(fiq);
+ fuse_request_assign_unique(fiq, req);
spin_lock(&queue->lock);
err = -ENOTCONN;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5c569c3cb53f..ecaec0fea3a1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -739,22 +739,18 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
int err;
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct fuse_conn *fc = get_fuse_conn(dir);
- struct dentry *res = NULL;
if (fuse_is_bad(dir))
return -EIO;
if (d_in_lookup(entry)) {
- res = fuse_lookup(dir, entry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- entry = res;
+ struct dentry *res = fuse_lookup(dir, entry, 0);
+ if (res || d_really_is_positive(entry))
+ return finish_no_open(file, res);
}
- if (!(flags & O_CREAT) || d_really_is_positive(entry))
- goto no_open;
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
/* Only creates */
file->f_mode |= FMODE_CREATED;
@@ -768,16 +764,13 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
goto mknod;
} else if (err == -EEXIST)
fuse_invalidate_entry(entry);
-out_dput:
- dput(res);
return err;
mknod:
err = fuse_mknod(idmap, dir, entry, mode, 0);
if (err)
- goto out_dput;
-no_open:
- return finish_no_open(file, res);
+ return err;
+ return finish_no_open(file, NULL);
}
/*
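
The fuse_atomic_open() rewrite leans on the ->atomic_open() contract: finish_no_open() takes ownership of whatever dentry it is handed, including NULL and, under the updated convention this series appears to rely on, an ERR_PTR from the lookup, so the old res/dput juggling disappears. Condensed (the gfs2 hunk further down adopts the same shape; foo_lookup() and foo_create_and_open() are placeholders for the per-filesystem pieces):

static int foo_atomic_open(struct inode *dir, struct dentry *dentry,
			   struct file *file, unsigned flags, umode_t mode)
{
	if (d_in_lookup(dentry)) {
		struct dentry *d = foo_lookup(dir, dentry, 0);

		/* error, or a positive result: hand it straight back to the VFS */
		if (d || d_really_is_positive(dentry))
			return finish_no_open(file, d);
	}
	if (!(flags & O_CREAT))
		return finish_no_open(file, NULL);

	return foo_create_and_open(dir, dentry, file, mode);	/* negative + O_CREAT */
}
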
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4adcf09d4b01..f1ef77a0be05 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -356,8 +356,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* Make the release synchronous if this is a fuseblk mount,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
+ *
+ * Always use the asynchronous file put because the current thread
+ * might be the fuse server. This can happen if a process starts some
+ * aio and closes the fd before the aio completes. Since aio takes its
+ * own ref to the file, the IO completion has to drop the ref, which is
+ * how the fuse server can end up closing its clients' files.
*/
- fuse_file_put(ff, ff->fm->fc->destroy);
+ fuse_file_put(ff, false);
}
void fuse_release_common(struct file *file, bool isdir)
@@ -865,22 +871,20 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_args_pages *ap = &ia->ap;
size_t count = ia->read.in.size;
size_t num_read = args->out_args[0].size;
- struct address_space *mapping = NULL;
-
- for (i = 0; mapping == NULL && i < ap->num_folios; i++)
- mapping = ap->folios[i]->mapping;
+ struct address_space *mapping;
+ struct inode *inode;
- if (mapping) {
- struct inode *inode = mapping->host;
+ WARN_ON_ONCE(!ap->num_folios);
+ mapping = ap->folios[0]->mapping;
+ inode = mapping->host;
- /*
- * Short read means EOF. If file size is larger, truncate it
- */
- if (!err && num_read < count)
- fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (!err && num_read < count)
+ fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
- fuse_invalidate_atime(inode);
- }
+ fuse_invalidate_atime(inode);
for (i = 0; i < ap->num_folios; i++) {
folio_end_read(ap->folios[i], !err);
@@ -1175,7 +1179,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
num = min(iov_iter_count(ii), fc->max_write);
ap->args.in_pages = true;
- ap->descs[0].offset = offset;
while (num && ap->num_folios < max_folios) {
size_t tmp;
@@ -1823,19 +1826,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_folios; i++) {
+ for (i = 0; i < ap->num_folios; i++)
/*
* Benchmarks showed that ending writeback within the
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
iomap_finish_folio_write(inode, ap->folios[i], 1);
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- wb_writeout_inc(&bdi->wb);
- }
wake_up(&fi->page_waitq);
}
@@ -2010,14 +2009,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
uint32_t folio_index, loff_t offset, unsigned len)
{
- struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;
ap->folios[folio_index] = folio;
ap->descs[folio_index].offset = offset;
ap->descs[folio_index].length = len;
-
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
}
static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
@@ -2960,10 +2956,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
+ .len = len,
.flags = flags
};
struct fuse_write_out outarg;
+ struct fuse_copy_file_range_out outarg_64;
+ u64 bytes_copied;
ssize_t err;
/* mark unstable when write-back is not used, and file_out gets
* extended */
@@ -3013,33 +3011,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
- args.opcode = FUSE_COPY_FILE_RANGE;
+ args.opcode = FUSE_COPY_FILE_RANGE_64;
args.nodeid = ff_in->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
- args.out_args[0].size = sizeof(outarg);
- args.out_args[0].value = &outarg;
+ args.out_args[0].size = sizeof(outarg_64);
+ args.out_args[0].value = &outarg_64;
+ if (fc->no_copy_file_range_64) {
+fallback:
+ /* Fall back to old op that can't handle large copy length */
+ args.opcode = FUSE_COPY_FILE_RANGE;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK);
+ }
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_copy_file_range = 1;
- err = -EOPNOTSUPP;
+ if (fc->no_copy_file_range_64) {
+ fc->no_copy_file_range = 1;
+ err = -EOPNOTSUPP;
+ } else {
+ fc->no_copy_file_range_64 = 1;
+ goto fallback;
+ }
}
- if (!err && outarg.size > len)
- err = -EIO;
-
if (err)
goto out;
+ bytes_copied = fc->no_copy_file_range_64 ?
+ outarg.size : outarg_64.bytes_copied;
+
+ if (bytes_copied > len) {
+ err = -EIO;
+ goto out;
+ }
+
truncate_inode_pages_range(inode_out->i_mapping,
ALIGN_DOWN(pos_out, PAGE_SIZE),
- ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+ ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1);
file_update_time(file_out);
- fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
+ fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied);
- err = outarg.size;
+ err = bytes_copied;
out:
if (is_unstable)
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
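
The copy_file_range hunk is the usual FUSE capability negotiation: try the new 64-bit opcode and, on -ENOSYS, latch a no_* bit and retry the old opcode with the length clamped to what a 32-bit reply can describe. Distilled, with send() standing in for fuse_simple_request():

	if (!fc->no_copy_file_range_64)
		err = send(FUSE_COPY_FILE_RANGE_64, len);	/* u64 bytes_copied */
	else
		err = -ENOSYS;

	if (err == -ENOSYS) {
		fc->no_copy_file_range_64 = 1;			/* remembered for the connection */
		len = min_t(size_t, len, UINT_MAX & PAGE_MASK);	/* fits the u32 reply */
		err = send(FUSE_COPY_FILE_RANGE, len);
		if (err == -ENOSYS) {
			fc->no_copy_file_range = 1;
			err = -EOPNOTSUPP;	/* VFS falls back to a generic copy */
		}
	}
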
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 5a9bd771a319..6e8373f97040 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -12,6 +12,8 @@
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)
+extern struct wait_queue_head fuse_dev_waitq;
+
struct fuse_arg;
struct fuse_args;
struct fuse_pqueue;
@@ -37,15 +39,22 @@ struct fuse_copy_state {
} ring;
};
-static inline struct fuse_dev *fuse_get_dev(struct file *file)
+#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1)
+#define FUSE_DEV_PTR_MASK (~1UL)
+
+static inline struct fuse_dev *__fuse_get_dev(struct file *file)
{
/*
* Lockless access is OK, because file->private data is set
* once during mount and is valid until the file is released.
*/
- return READ_ONCE(file->private_data);
+ struct fuse_dev *fud = READ_ONCE(file->private_data);
+
+ return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK);
}
+struct fuse_dev *fuse_get_dev(struct file *file);
+
unsigned int fuse_req_hash(u64 unique);
struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
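
FUSE_DEV_SYNC_INIT overloads file->private_data with a tagged pointer: the low bit means "synchronous init requested, fud not installed yet", and __fuse_get_dev() masks the bit off so legacy callers keep seeing NULL until the real pointer is stored. This only works because a kmalloc'ed struct fuse_dev is always at least 2-byte aligned. In isolation:

	#define SYNC_INIT ((struct fuse_dev *) 1)	/* tag: low bit set */
	#define PTR_MASK  (~1UL)

	WRITE_ONCE(file->private_data, SYNC_INIT);	/* publish the tag */

	struct fuse_dev *fud = READ_ONCE(file->private_data);
	fud = (struct fuse_dev *)((unsigned long) fud & PTR_MASK);
	/* fud == NULL while only the tag is set; never dereference the tag */

The blocking fuse_get_dev() added in dev.c then waits on fuse_dev_waitq until private_data changes away from the bare tag.
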
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index cc428d04be3e..c2f2a48156d6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -856,6 +856,9 @@ struct fuse_conn {
/** Does the filesystem support copy_file_range? */
unsigned no_copy_file_range:1;
+ /** Does the filesystem support copy_file_range_64? */
+ unsigned no_copy_file_range_64:1;
+
/* Send DESTROY request */
unsigned int destroy:1;
@@ -901,6 +904,9 @@ struct fuse_conn {
/* Is link not implemented by fs? */
unsigned int no_link:1;
+ /* Is synchronous FUSE_INIT allowed? */
+ unsigned int sync_init:1;
+
/* Use io_uring for communication */
unsigned int io_uring;
@@ -1255,6 +1261,11 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
+ * Assign a unique id to a fuse request
+ */
+void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req);
+
+/**
* End a finished request
*/
void fuse_request_end(struct fuse_req *req);
@@ -1315,7 +1326,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_mount *fm);
+int fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
@@ -1407,6 +1418,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name, u32 flags);
+/*
+ * Try to prune this inode. If neither the inode itself nor dentries associated
+ * with this inode have any external reference, then the inode can be freed.
+ */
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid);
+
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
@@ -1512,29 +1529,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir);
-/* passthrough.c */
-static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
- return READ_ONCE(fi->fb);
-#else
- return NULL;
-#endif
-}
-
-static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
- struct fuse_backing *fb)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
- return xchg(&fi->fb, fb);
-#else
- return NULL;
-#endif
-}
-
+/* backing.c */
#ifdef CONFIG_FUSE_PASSTHROUGH
struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
void fuse_backing_put(struct fuse_backing *fb);
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id);
#else
static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
@@ -1545,6 +1544,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
static inline void fuse_backing_put(struct fuse_backing *fb)
{
}
+static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc,
+ int backing_id)
+{
+ return NULL;
+}
#endif
void fuse_backing_files_init(struct fuse_conn *fc);
@@ -1552,9 +1556,27 @@ void fuse_backing_files_free(struct fuse_conn *fc);
int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
int fuse_backing_close(struct fuse_conn *fc, int backing_id);
-struct fuse_backing *fuse_passthrough_open(struct file *file,
- struct inode *inode,
- int backing_id);
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return READ_ONCE(fi->fb);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+ struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return xchg(&fi->fb, fb);
+#else
+ return NULL;
+#endif
+}
+
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id);
void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);
static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7485a41af892..d1babf56f254 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@
*/
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#include "dev_uring_i.h"
#include <linux/dax.h>
@@ -34,6 +35,7 @@ MODULE_LICENSE("GPL");
static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq);
static int set_global_limit(const char *val, const struct kernel_param *kp);
@@ -101,14 +103,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (!fi)
return NULL;
- fi->i_time = 0;
+ /* Initialize private data (i.e. everything except fi->inode) */
+ BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0);
+ memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode));
+
fi->inval_mask = ~0;
- fi->nodeid = 0;
- fi->nlookup = 0;
- fi->attr_version = 0;
- fi->orig_ino = 0;
- fi->state = 0;
- fi->submount_lookup = NULL;
mutex_init(&fi->mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
@@ -586,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
return 0;
}
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid)
+{
+ struct inode *inode;
+
+ inode = fuse_ilookup(fc, nodeid, NULL);
+ if (!inode)
+ return;
+ d_prune_aliases(inode);
+ iput(inode);
+}
+
bool fuse_lock_inode(struct inode *inode)
{
bool locked = false;
@@ -1469,7 +1479,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_mount *fm)
+static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
u64 flags;
@@ -1528,10 +1538,30 @@ void fuse_send_init(struct fuse_mount *fm)
ia->args.out_args[0].value = &ia->out;
ia->args.force = true;
ia->args.nocreds = true;
- ia->args.end = process_init_reply;
- if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fm, &ia->args, -ENOTCONN);
+ return ia;
+}
+
+int fuse_send_init(struct fuse_mount *fm)
+{
+ struct fuse_init_args *ia = fuse_new_init(fm);
+ int err;
+
+ if (fm->fc->sync_init) {
+ err = fuse_simple_request(fm, &ia->args);
+ /* Ignore size of init reply */
+ if (err > 0)
+ err = 0;
+ } else {
+ ia->args.end = process_init_reply;
+ err = fuse_simple_background(fm, &ia->args, GFP_KERNEL);
+ if (!err)
+ return 0;
+ }
+ process_init_reply(fm, &ia->args, err);
+ if (fm->fc->conn_error)
+ return -ENOTCONN;
+ return 0;
}
EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1561,8 +1591,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- /* fuse does it's own writeback accounting */
- sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
@@ -1821,6 +1849,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
!sb_set_blocksize(sb, PAGE_SIZE))
goto err;
#endif
+ fc->sync_fs = 1;
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -1872,8 +1901,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_lock(&fuse_mutex);
err = -EINVAL;
- if (ctx->fudptr && *ctx->fudptr)
- goto err_unlock;
+ if (ctx->fudptr && *ctx->fudptr) {
+ if (*ctx->fudptr == FUSE_DEV_SYNC_INIT)
+ fc->sync_init = 1;
+ else
+ goto err_unlock;
+ }
err = fuse_ctl_add_conn(fc);
if (err)
@@ -1881,8 +1914,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- if (ctx->fudptr)
+ if (ctx->fudptr) {
*ctx->fudptr = fud;
+ wake_up_all(&fuse_dev_waitq);
+ }
mutex_unlock(&fuse_mutex);
return 0;
@@ -1903,6 +1938,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common);
static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
+ struct fuse_mount *fm;
int err;
if (!ctx->file || !ctx->rootmode_present ||
@@ -1923,8 +1959,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
return err;
/* file->private_data shall be visible on all CPUs after this */
smp_mb();
- fuse_send_init(get_fuse_mount_super(sb));
- return 0;
+
+ fm = get_fuse_mount_super(sb);
+
+ return fuse_send_init(fm);
}
/*
@@ -1985,7 +2023,7 @@ static int fuse_get_tree(struct fs_context *fsc)
* Allow creating a fuse mount with an already initialized fuse
* connection
*/
- fud = READ_ONCE(ctx->file->private_data);
+ fud = __fuse_get_dev(ctx->file);
if (ctx->file->f_op == &fuse_dev_operations && fud) {
fsc->sget_key = fud->fc;
sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
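
fuse_alloc_inode() swaps a growing list of per-field resets for one memset() over everything behind the embedded inode. The BUILD_BUG_ON() pins the layout assumption: the VFS inode must stay the first member for the offset arithmetic to cover exactly the private part. The trick in general form:

struct my_inode {
	struct inode inode;		/* must remain the first member */
	/* private fields, all zeroed below */
};

static void zero_private(struct my_inode *mi)
{
	BUILD_BUG_ON(offsetof(struct my_inode, inode) != 0);
	memset((void *) mi + sizeof(mi->inode), 0,
	       sizeof(*mi) - sizeof(mi->inode));
}

Fields that need a non-zero default (inval_mask = ~0 above) are set after the memset.
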
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c99e285f3183..3728933188f3 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
(ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
return -EINVAL;
- fb = fuse_passthrough_open(file, inode,
- ff->args->open_outarg.backing_id);
+ fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id);
if (IS_ERR(fb))
return PTR_ERR(fb);
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index eb97ac009e75..72de97c03d0e 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -144,171 +144,12 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
return backing_file_mmap(backing_file, vma, &ctx);
}
-struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
-{
- if (fb && refcount_inc_not_zero(&fb->count))
- return fb;
- return NULL;
-}
-
-static void fuse_backing_free(struct fuse_backing *fb)
-{
- pr_debug("%s: fb=0x%p\n", __func__, fb);
-
- if (fb->file)
- fput(fb->file);
- put_cred(fb->cred);
- kfree_rcu(fb, rcu);
-}
-
-void fuse_backing_put(struct fuse_backing *fb)
-{
- if (fb && refcount_dec_and_test(&fb->count))
- fuse_backing_free(fb);
-}
-
-void fuse_backing_files_init(struct fuse_conn *fc)
-{
- idr_init(&fc->backing_files_map);
-}
-
-static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
-{
- int id;
-
- idr_preload(GFP_KERNEL);
- spin_lock(&fc->lock);
- /* FIXME: xarray might be space inefficient */
- id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
- spin_unlock(&fc->lock);
- idr_preload_end();
-
- WARN_ON_ONCE(id == 0);
- return id;
-}
-
-static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
- int id)
-{
- struct fuse_backing *fb;
-
- spin_lock(&fc->lock);
- fb = idr_remove(&fc->backing_files_map, id);
- spin_unlock(&fc->lock);
-
- return fb;
-}
-
-static int fuse_backing_id_free(int id, void *p, void *data)
-{
- struct fuse_backing *fb = p;
-
- WARN_ON_ONCE(refcount_read(&fb->count) != 1);
- fuse_backing_free(fb);
- return 0;
-}
-
-void fuse_backing_files_free(struct fuse_conn *fc)
-{
- idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
- idr_destroy(&fc->backing_files_map);
-}
-
-int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
-{
- struct file *file;
- struct super_block *backing_sb;
- struct fuse_backing *fb = NULL;
- int res;
-
- pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
-
- /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
- res = -EPERM;
- if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
- goto out;
-
- res = -EINVAL;
- if (map->flags || map->padding)
- goto out;
-
- file = fget_raw(map->fd);
- res = -EBADF;
- if (!file)
- goto out;
-
- /* read/write/splice/mmap passthrough only relevant for regular files */
- res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
- if (!d_is_reg(file->f_path.dentry))
- goto out_fput;
-
- backing_sb = file_inode(file)->i_sb;
- res = -ELOOP;
- if (backing_sb->s_stack_depth >= fc->max_stack_depth)
- goto out_fput;
-
- fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
- res = -ENOMEM;
- if (!fb)
- goto out_fput;
-
- fb->file = file;
- fb->cred = prepare_creds();
- refcount_set(&fb->count, 1);
-
- res = fuse_backing_id_alloc(fc, fb);
- if (res < 0) {
- fuse_backing_free(fb);
- fb = NULL;
- }
-
-out:
- pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
-
- return res;
-
-out_fput:
- fput(file);
- goto out;
-}
-
-int fuse_backing_close(struct fuse_conn *fc, int backing_id)
-{
- struct fuse_backing *fb = NULL;
- int err;
-
- pr_debug("%s: backing_id=%d\n", __func__, backing_id);
-
- /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
- err = -EPERM;
- if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
- goto out;
-
- err = -EINVAL;
- if (backing_id <= 0)
- goto out;
-
- err = -ENOENT;
- fb = fuse_backing_id_remove(fc, backing_id);
- if (!fb)
- goto out;
-
- fuse_backing_put(fb);
- err = 0;
-out:
- pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
-
- return err;
-}
-
/*
* Setup passthrough to a backing file.
*
* Returns an fb object with elevated refcount to be stored in fuse inode.
*/
-struct fuse_backing *fuse_passthrough_open(struct file *file,
- struct inode *inode,
- int backing_id)
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
@@ -320,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file,
if (backing_id <= 0)
goto out;
- rcu_read_lock();
- fb = idr_find(&fc->backing_files_map, backing_id);
- fb = fuse_backing_get(fb);
- rcu_read_unlock();
-
err = -ENOENT;
+ fb = fuse_backing_lookup(fc, backing_id);
if (!fb)
goto out;
diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c
new file mode 100644
index 000000000000..93bd72efc98c
--- /dev/null
+++ b/fs/fuse/trace.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "dev_uring_i.h"
+#include "fuse_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/pagemap.h>
+
+#define CREATE_TRACE_POINTS
+#include "fuse_trace.h"
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 76c8fd0bfc75..6bc7c97b017d 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -20,6 +20,7 @@
#include <linux/cleanup.h>
#include <linux/uio.h>
#include "fuse_i.h"
+#include "fuse_dev_i.h"
/* Used to help calculate the FUSE connection's max_pages limit for a request's
* size. Parts of the struct fuse_req are sliced into scattergather lists in
@@ -761,7 +762,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
static void virtio_fs_request_complete(struct fuse_req *req,
struct virtio_fs_vq *fsvq)
{
- struct fuse_pqueue *fpq = &fsvq->fud->pq;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
@@ -790,9 +790,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
}
}
- spin_lock(&fpq->lock);
clear_bit(FR_SENT, &req->flags);
- spin_unlock(&fpq->lock);
fuse_request_end(req);
spin_lock(&fsvq->lock);
@@ -1384,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
unsigned int out_sgs = 0;
unsigned int in_sgs = 0;
unsigned int total_sgs;
- unsigned int i;
+ unsigned int i, hash;
int ret;
bool notify;
struct fuse_pqueue *fpq;
@@ -1444,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
/* Request successfully sent. */
fpq = &fsvq->fud->pq;
+ hash = fuse_req_hash(req->in.h.unique);
spin_lock(&fpq->lock);
- list_add_tail(&req->list, fpq->processing);
+ list_add_tail(&req->list, &fpq->processing[hash]);
spin_unlock(&fpq->lock);
set_bit(FR_SENT, &req->flags);
/* matches barrier in request_wait_answer() */
@@ -1480,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
struct virtio_fs_vq *fsvq;
int ret;
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique(fiq);
+ fuse_request_assign_unique(fiq, req);
clear_bit(FR_PENDING, &req->flags);
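
virtio_fs previously kept all in-flight requests on a single processing list; it now inserts each request into the hash bucket that fuse_request_find() will search when the reply arrives, so both sides must derive the bucket from the same unique id:

	/* submit side (virtio_fs_enqueue_req) */
	hash = fuse_req_hash(req->in.h.unique);
	spin_lock(&fpq->lock);
	list_add_tail(&req->list, &fpq->processing[hash]);
	spin_unlock(&fpq->lock);

	/* reply side: fuse_request_find() scans fpq->processing[fuse_req_hash(unique)] */

This is why fuse_req_hash() gains an EXPORT_SYMBOL_GPL in the dev.c hunk, and why fuse_request_assign_unique() must run before the request is queued.
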
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8760e7e20c9d..8a7ed80d9f2d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1368,27 +1368,19 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags,
umode_t mode)
{
- struct dentry *d;
bool excl = !!(flags & O_EXCL);
- if (!d_in_lookup(dentry))
- goto skip_lookup;
-
- d = __gfs2_lookup(dir, dentry, file);
- if (IS_ERR(d))
- return PTR_ERR(d);
- if (d != NULL)
- dentry = d;
- if (d_really_is_positive(dentry)) {
- if (!(file->f_mode & FMODE_OPENED))
+ if (d_in_lookup(dentry)) {
+ struct dentry *d = __gfs2_lookup(dir, dentry, file);
+ if (file->f_mode & FMODE_OPENED) {
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ dput(d);
+ return excl && (flags & O_CREAT) ? -EEXIST : 0;
+ }
+ if (d || d_really_is_positive(dentry))
return finish_no_open(file, d);
- dput(d);
- return excl && (flags & O_CREAT) ? -EEXIST : 0;
}
-
- BUG_ON(d != NULL);
-
-skip_lookup:
if (!(flags & O_CREAT))
return -ENOENT;
diff --git a/fs/internal.h b/fs/internal.h
index a33d18ee5b74..b5c62abefff4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -84,9 +84,9 @@ void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page);
-int path_umount(struct path *path, int flags);
+int path_umount(const struct path *path, int flags);
int show_path(struct seq_file *m, struct dentry *root);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 38861ca04899..2d0719bf6d87 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -131,7 +131,7 @@ __flush_batch(journal_t *journal, int *batch_count)
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
+ write_dirty_buffer(journal->j_chkpt_bhs[i], JBD2_JOURNAL_REQ_FLAGS);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index fcedeb514e14..21f3d029da7d 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,9 +59,15 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
*/
inode->i_link[inode->i_size] = '\0';
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &jfs_file_inode_operations;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
unlock_new_inode(inode);
return inode;
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index ab11849cf9cc..0ab83bb7bbdf 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2903,7 +2903,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
stbl = DT_GETSTBL(p);
for (i = index; i < p->header.nextindex; i++) {
- if (stbl[i] < 0 || stbl[i] > 127) {
+ if (stbl[i] < 0 || stbl[i] >= DTPAGEMAXSLOT) {
jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld",
i, stbl[i], (long)ip->i_ino, (long long)bn);
free_page(dirent_buf);
@@ -3108,7 +3108,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
/* get the leftmost entry */
stbl = DT_GETSTBL(p);
- if (stbl[0] < 0 || stbl[0] > 127) {
+ if (stbl[0] < 0 || stbl[0] >= DTPAGEMAXSLOT) {
DT_PUTPAGE(mp);
jfs_error(ip->i_sb, "stbl[0] out of bound\n");
return -EIO;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 270808b6219b..b343c5ea1159 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1199,7 +1199,6 @@ static int open_dummy_log(struct super_block *sb)
init_waitqueue_head(&dummy_log->syncwait);
dummy_log->no_integrity = 1;
/* Make up some stuff */
- dummy_log->base = 0;
dummy_log->size = 1024;
rc = lmLogInit(dummy_log);
if (rc) {
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 98f9a432c336..52e6b58c5dbd 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -325,13 +325,13 @@ static int chkSuper(struct super_block *sb)
if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) !=
cpu_to_le32(JFS_BAD_SAIT)) {
expected_AIM_bytesize = 2 * PSIZE;
- AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize;
+ AIM_bytesize = lengthPXD(&j_sb->s_aim2) * bsize;
expected_AIT_bytesize = 4 * PSIZE;
- AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize;
- AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize;
- AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize;
+ AIT_bytesize = lengthPXD(&j_sb->s_ait2) * bsize;
+ AIM_byte_addr = addressPXD(&j_sb->s_aim2) * bsize;
+ AIT_byte_addr = addressPXD(&j_sb->s_ait2) * bsize;
byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr;
- fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize;
+ fsckwsp_addr = addressPXD(&j_sb->s_fsckpxd) * bsize;
byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr;
if ((AIM_bytesize != expected_AIM_bytesize) ||
(AIT_bytesize != expected_AIT_bytesize) ||
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index be17e3c43582..7840a03e5bcb 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -272,14 +272,15 @@ int txInit(void)
if (TxBlock == NULL)
return -ENOMEM;
- for (k = 1; k < nTxBlock - 1; k++) {
- TxBlock[k].next = k + 1;
+ for (k = 0; k < nTxBlock; k++) {
init_waitqueue_head(&TxBlock[k].gcwait);
init_waitqueue_head(&TxBlock[k].waitor);
}
+
+ for (k = 1; k < nTxBlock - 1; k++) {
+ TxBlock[k].next = k + 1;
+ }
TxBlock[k].next = 0;
- init_waitqueue_head(&TxBlock[k].gcwait);
- init_waitqueue_head(&TxBlock[k].waitor);
TxAnchor.freetid = 1;
init_waitqueue_head(&TxAnchor.freewait);
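The txInit() reordering is easiest to read as two separate passes: the old combined loop initialized wait queues only for slots 1..nTxBlock-1, leaving slot 0 untouched, while the new first pass covers every slot before the free list is threaded. A reduced, stand-alone sketch (types are illustrative):

struct txblock_sketch { int gcwait_init, waitor_init, next; };

static void tx_init_sketch(struct txblock_sketch *tb, int n)
{
        int k;

        for (k = 0; k < n; k++)         /* every slot, 0 included */
                tb[k].gcwait_init = tb[k].waitor_init = 1;
        for (k = 1; k < n - 1; k++)     /* free tids chain 1 -> ... -> n-1 */
                tb[k].next = k + 1;
        tb[k].next = 0;                 /* k == n - 1; tid 0 ends the list */
}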
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e80262a51884..d68afa196535 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -216,8 +216,7 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
- svc_xprt_destroy_all(serv, net);
- svc_rpcb_cleanup(serv, net);
+ svc_xprt_destroy_all(serv, net, true);
return err;
}
@@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
- svc_xprt_destroy_all(serv, net);
- svc_rpcb_cleanup(serv, net);
+ svc_xprt_destroy_all(serv, net, true);
}
} else {
pr_err("%s: no users! net=%x\n",
diff --git a/fs/mount.h b/fs/mount.h
index 79c85639a7ba..f13a28752d0b 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@ struct mount {
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
- struct list_head mnt_instance; /* mount instance on sb->s_mounts */
+ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */
+ struct mount * __aligned(1) *mnt_pprev_for_sb;
+ /* except that LSB of pprev is stolen */
+#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
@@ -148,6 +151,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
extern seqlock_t mount_lock;
+DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock),
+ write_sequnlock(&mount_lock))
+DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock),
+ read_sequnlock_excl(&mount_lock))
+
struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
@@ -224,4 +232,33 @@ static inline void mnt_notify_add(struct mount *m)
}
#endif
+static inline struct mount *topmost_overmount(struct mount *m)
+{
+ while (m->overmount)
+ m = m->overmount;
+ return m;
+}
+
+static inline bool __test_write_hold(struct mount * __aligned(1) *val)
+{
+ return (unsigned long)val & WRITE_HOLD;
+}
+
+static inline bool test_write_hold(const struct mount *m)
+{
+ return __test_write_hold(m->mnt_pprev_for_sb);
+}
+
+static inline void set_write_hold(struct mount *m)
+{
+ m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb
+ | WRITE_HOLD);
+}
+
+static inline void clear_write_hold(struct mount *m)
+{
+ m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb
+ & ~WRITE_HOLD);
+}
+
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
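The stolen-LSB scheme above is the standard pointer-tagging pattern: a pointer to a word-aligned object always has a zero low bit, so that bit can carry WRITE_HOLD as long as every real dereference masks it off first. In generic, stand-alone form (names illustrative, not patch text):

#include <stdint.h>

#define TAG_BIT 1UL

static inline void *tag_ptr(void *p)           /* set the stolen bit */
{
        return (void *)((uintptr_t)p | TAG_BIT);
}

static inline void *untag_ptr(void *p)         /* recover the pointer */
{
        return (void *)((uintptr_t)p & ~TAG_BIT);
}

static inline int ptr_is_tagged(void *p)
{
        return (uintptr_t)p & TAG_BIT;
}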
diff --git a/fs/namespace.c b/fs/namespace.c
index dc01b14c58cd..d82910f33dc4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -91,6 +91,14 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
+static inline void namespace_lock(void);
+static void namespace_unlock(void);
+DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
+DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
+ up_read(&namespace_sem))
+
+DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))
+
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
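For readers new to the guard machinery: DEFINE_LOCK_GUARD_0 generates a scope-bound lock class, so guard(namespace_shared)() holds namespace_sem for the rest of the enclosing block and releases it on any exit path. A hypothetical caller (not from this patch):

static bool ns_has_mounts(struct mnt_namespace *ns)
{
        guard(namespace_shared)();      /* down_read(&namespace_sem) */

        /* every return below implies up_read(&namespace_sem) */
        return ns && !RB_EMPTY_ROOT(&ns->mounts);
}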
@@ -363,7 +371,7 @@ out_free_cache:
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
-bool __mnt_is_readonly(struct vfsmount *mnt)
+bool __mnt_is_readonly(const struct vfsmount *mnt)
{
return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
@@ -403,7 +411,7 @@ static unsigned int mnt_get_writers(struct mount *mnt)
#endif
}
-static int mnt_is_readonly(struct vfsmount *mnt)
+static int mnt_is_readonly(const struct vfsmount *mnt)
{
if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
return 1;
@@ -444,31 +452,31 @@ int mnt_get_write_access(struct vfsmount *m)
mnt_inc_writers(mnt);
/*
* The store to mnt_inc_writers must be visible before we pass
- * MNT_WRITE_HOLD loop below, so that the slowpath can see our
- * incremented count after it has set MNT_WRITE_HOLD.
+ * WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set WRITE_HOLD.
*/
smp_mb();
might_lock(&mount_lock.lock);
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
cpu_relax();
} else {
/*
* This prevents priority inversion, if the task
- * setting MNT_WRITE_HOLD got preempted on a remote
+ * setting WRITE_HOLD got preempted on a remote
* CPU, and it prevents livelock if the task setting
- * MNT_WRITE_HOLD has a lower priority and is bound to
+ * WRITE_HOLD has a lower priority and is bound to
* the same CPU as the task that is spinning here.
*/
preempt_enable();
- lock_mount_hash();
- unlock_mount_hash();
+ read_seqlock_excl(&mount_lock);
+ read_sequnlock_excl(&mount_lock);
preempt_disable();
}
}
/*
* The barrier pairs with the barrier sb_start_ro_state_change() making
- * sure that if we see MNT_WRITE_HOLD cleared, we will also see
+ * sure that if we see WRITE_HOLD cleared, we will also see
* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
* mnt_is_readonly() and bail in case we are racing with remount
* read-only.
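The pairing described in the two comments above is the classic store-buffering pattern; a free-standing C11 rendition, in case the kernel-specific spelling obscures it (names illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int writers;
static atomic_bool write_hold;

static void fast_path(void)             /* mnt_get_write_access() side */
{
        atomic_fetch_add_explicit(&writers, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        while (atomic_load_explicit(&write_hold, memory_order_relaxed))
                ;                       /* spin until the hold drops */
}

static bool slow_path(void)             /* mnt_hold_writers() side */
{
        atomic_store_explicit(&write_hold, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        return atomic_load_explicit(&writers, memory_order_relaxed) == 0;
}

/* The fences guarantee at least one side sees the other: either the
 * fast path spins on write_hold, or the slow path sees the raised
 * counter and backs off. */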
@@ -606,16 +614,16 @@ EXPORT_SYMBOL(mnt_drop_write_file);
* a call to mnt_unhold_writers() in order to stop preventing write access to
* @mnt.
*
- * Context: This function expects lock_mount_hash() to be held serializing
- * setting MNT_WRITE_HOLD.
+ * Context: This function expects to be in mount_locked_reader scope serializing
+ * setting WRITE_HOLD.
* Return: On success 0 is returned.
* On error, -EBUSY is returned.
*/
static inline int mnt_hold_writers(struct mount *mnt)
{
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
+ set_write_hold(mnt);
/*
- * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * After storing WRITE_HOLD, we'll read the counters. This store
* should be visible before we do.
*/
smp_mb();
@@ -631,9 +639,9 @@ static inline int mnt_hold_writers(struct mount *mnt)
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
- * MNT_WRITE_HOLD protects against this scenario, because
+ * WRITE_HOLD protects against this scenario, because
* mnt_want_write first increments count, then smp_mb, then spins on
- * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+ * WRITE_HOLD, so it can't be decremented by another CPU while
* we're counting up here.
*/
if (mnt_get_writers(mnt) > 0)
@@ -649,19 +657,42 @@ static inline int mnt_hold_writers(struct mount *mnt)
* Stop preventing write access to @mnt allowing callers to gain write access
* to @mnt again.
*
- * This function can only be called after a successful call to
- * mnt_hold_writers().
+ * This function can only be called after a call to mnt_hold_writers().
*
- * Context: This function expects lock_mount_hash() to be held.
+ * Context: This function expects to be in the same mount_locked_reader scope
+ * as the matching mnt_hold_writers().
*/
static inline void mnt_unhold_writers(struct mount *mnt)
{
+ if (!test_write_hold(mnt))
+ return;
/*
- * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ clear_write_hold(mnt);
+}
+
+static inline void mnt_del_instance(struct mount *m)
+{
+ struct mount **p = m->mnt_pprev_for_sb;
+ struct mount *next = m->mnt_next_for_sb;
+
+ if (next)
+ next->mnt_pprev_for_sb = p;
+ *p = next;
+}
+
+static inline void mnt_add_instance(struct mount *m, struct super_block *s)
+{
+ struct mount *first = s->s_mounts;
+
+ if (first)
+ first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
+ m->mnt_next_for_sb = first;
+ m->mnt_pprev_for_sb = &s->s_mounts;
+ s->s_mounts = m;
}
static int mnt_make_readonly(struct mount *mnt)
@@ -677,17 +708,17 @@ static int mnt_make_readonly(struct mount *mnt)
int sb_prepare_remount_readonly(struct super_block *sb)
{
- struct mount *mnt;
int err = 0;
- /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
+ /* Racy optimization. Recheck the counter under WRITE_HOLD */
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
- lock_mount_hash();
- list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
- err = mnt_hold_writers(mnt);
+ guard(mount_locked_reader)();
+
+ for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
+ if (!(m->mnt.mnt_flags & MNT_READONLY)) {
+ err = mnt_hold_writers(m);
if (err)
break;
}
@@ -697,11 +728,10 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (!err)
sb_start_ro_state_change(sb);
- list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
+ if (test_write_hold(m))
+ clear_write_hold(m);
}
- unlock_mount_hash();
return err;
}
@@ -760,24 +790,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
}
/**
- * __lookup_mnt - find first child mount
+ * __lookup_mnt - mount hash lookup
* @mnt: parent mount
- * @dentry: mountpoint
- *
- * If @mnt has a child mount @c mounted @dentry find and return it.
+ * @dentry: dentry of mountpoint
*
- * Note that the child mount @c need not be unique. There are cases
- * where shadow mounts are created. For example, during mount
- * propagation when a source mount @mnt whose root got overmounted by a
- * mount @o after path lookup but before @namespace_sem could be
- * acquired gets copied and propagated. So @mnt gets copied including
- * @o. When @mnt is propagated to a destination mount @d that already
- * has another mount @n mounted at the same mountpoint then the source
- * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
- * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
- * on @dentry.
+ * If @mnt has a child mount @c mounted on @dentry, find and return it.
+ * The caller must either hold the spinlock component of @mount_lock,
+ * or hold rcu_read_lock(), sampling the seqcount component before the
+ * call and rechecking it afterwards.
*
- * Return: The first child of @mnt mounted @dentry or NULL.
+ * Return: The child of @mnt mounted on @dentry or %NULL.
*/
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -790,21 +812,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
return NULL;
}
-/*
- * lookup_mnt - Return the first child mount mounted at path
- *
- * "First" means first mounted chronologically. If you create the
- * following mounts:
- *
- * mount /dev/sda1 /mnt
- * mount /dev/sda2 /mnt
- * mount /dev/sda3 /mnt
- *
- * Then lookup_mnt() on the base /mnt dentry in the root mount will
- * return successively the root dentry and vfsmount of /dev/sda1, then
- * /dev/sda2, then /dev/sda3, then NULL.
+/**
+ * lookup_mnt - Return the child mount mounted at given location
+ * @path: location in the namespace
*
- * lookup_mnt takes a reference to the found vfsmount.
+ * Acquires and returns a new reference to the mount at the given
+ * location, or %NULL if nothing is mounted there.
*/
struct vfsmount *lookup_mnt(const struct path *path)
{
@@ -841,22 +854,20 @@ bool __is_local_mountpoint(const struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt, *n;
- bool is_covered = false;
- down_read(&namespace_sem);
- rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
- is_covered = (mnt->mnt_mountpoint == dentry);
- if (is_covered)
- break;
- }
- up_read(&namespace_sem);
+ guard(namespace_shared)();
- return is_covered;
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
+ if (mnt->mnt_mountpoint == dentry)
+ return true;
+
+ return false;
}
struct pinned_mountpoint {
struct hlist_node node;
struct mountpoint *mp;
+ struct mount *parent;
};
static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
@@ -947,7 +958,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m)
}
}
-static inline int check_mnt(struct mount *mnt)
+static inline int check_mnt(const struct mount *mnt)
{
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
@@ -1149,6 +1160,20 @@ static void commit_tree(struct mount *mnt)
touch_mnt_namespace(n);
}
+static void setup_mnt(struct mount *m, struct dentry *root)
+{
+ struct super_block *s = root->d_sb;
+
+ atomic_inc(&s->s_active);
+ m->mnt.mnt_sb = s;
+ m->mnt.mnt_root = dget(root);
+ m->mnt_mountpoint = m->mnt.mnt_root;
+ m->mnt_parent = m;
+
+ guard(mount_locked_reader)();
+ mnt_add_instance(m, s);
+}
+
/**
* vfs_create_mount - Create a mount for a configured superblock
* @fc: The configuration context with the superblock attached
@@ -1172,15 +1197,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- atomic_inc(&fc->root->d_sb->s_active);
- mnt->mnt.mnt_sb = fc->root->d_sb;
- mnt->mnt.mnt_root = dget(fc->root);
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
+ setup_mnt(mnt, fc->root);
- lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
- unlock_mount_hash();
return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);
@@ -1221,8 +1239,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
return ERR_CAST(fc);
if (name)
- ret = vfs_parse_fs_string(fc, "source",
- name, strlen(name));
+ ret = vfs_parse_fs_string(fc, "source", name);
if (!ret)
ret = parse_monolithic_mount_data(fc, data);
if (!ret)
@@ -1238,7 +1255,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount);
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
- struct super_block *sb = old->mnt.mnt_sb;
struct mount *mnt;
int err;
@@ -1263,16 +1279,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (mnt->mnt_group_id)
set_mnt_shared(mnt);
- atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
- mnt->mnt.mnt_sb = sb;
- mnt->mnt.mnt_root = dget(root);
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
- lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
- unlock_mount_hash();
+ setup_mnt(mnt, root);
if (flag & CL_PRIVATE) // we are done with it
return mnt;
@@ -1378,7 +1387,7 @@ static void mntput_no_expire(struct mount *mnt)
mnt->mnt.mnt_flags |= MNT_DOOMED;
rcu_read_unlock();
- list_del(&mnt->mnt_instance);
+ mnt_del_instance(mnt);
if (unlikely(!list_empty(&mnt->mnt_expire)))
list_del(&mnt->mnt_expire);
@@ -1719,8 +1728,6 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
-DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
-
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
@@ -1785,6 +1792,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
+ bulk_make_private(&tmp_list);
+
while (!list_empty(&tmp_list)) {
struct mnt_namespace *ns;
bool disconnect;
@@ -1809,7 +1818,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
umount_mnt(p);
}
}
- change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
@@ -1969,10 +1977,11 @@ void __detach_mounts(struct dentry *dentry)
struct pinned_mountpoint mp = {};
struct mount *mnt;
- namespace_lock();
- lock_mount_hash();
+ guard(namespace_excl)();
+ guard(mount_writer)();
+
if (!lookup_mountpoint(dentry, &mp))
- goto out_unlock;
+ return;
event++;
while (mp.node.next) {
@@ -1984,9 +1993,6 @@ void __detach_mounts(struct dentry *dentry)
else umount_tree(mnt, UMOUNT_CONNECTED);
}
unpin_mountpoint(&mp);
-out_unlock:
- unlock_mount_hash();
- namespace_unlock();
}
/*
@@ -2025,7 +2031,7 @@ static int can_umount(const struct path *path, int flags)
}
// caller is responsible for flags being sane
-int path_umount(struct path *path, int flags)
+int path_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
int ret;
@@ -2238,7 +2244,7 @@ static inline bool extend_array(struct path **res, struct path **to_free,
return p;
}
-struct path *collect_paths(const struct path *path,
+const struct path *collect_paths(const struct path *path,
struct path *prealloc, unsigned count)
{
struct mount *root = real_mount(path->mnt);
@@ -2246,7 +2252,7 @@ struct path *collect_paths(const struct path *path,
struct path *res = prealloc, *to_free = NULL;
unsigned n = 0;
- guard(rwsem_read)(&namespace_sem);
+ guard(namespace_shared)();
if (!check_mnt(root))
return ERR_PTR(-EINVAL);
@@ -2272,9 +2278,9 @@ struct path *collect_paths(const struct path *path,
return res;
}
-void drop_collected_paths(struct path *paths, struct path *prealloc)
+void drop_collected_paths(const struct path *paths, const struct path *prealloc)
{
- for (struct path *p = paths; p->mnt; p++)
+ for (const struct path *p = paths; p->mnt; p++)
path_put(p);
if (paths != prealloc)
kfree(paths);
@@ -2301,7 +2307,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
return;
}
- scoped_guard(namespace_lock, &namespace_sem) {
+ scoped_guard(namespace_excl) {
if (!anon_ns_root(m))
return;
@@ -2312,6 +2318,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
}
}
+/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */
static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -2328,12 +2335,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
- bool res;
-
- read_seqlock_excl(&mount_lock);
- res = __has_locked_children(mnt, dentry);
- read_sequnlock_excl(&mount_lock);
- return res;
+ guard(mount_locked_reader)();
+ return __has_locked_children(mnt, dentry);
}
/*
@@ -2341,21 +2344,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* specified subtree. Such references can act as pins for mount namespaces
* that aren't checked by the mount-cycle checking code, thereby allowing
* cycles to be made.
+ *
+ * locks: mount_locked_reader || namespace_shared && pinned(subtree)
*/
static bool check_for_nsfs_mounts(struct mount *subtree)
{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
+ for (struct mount *p = subtree; p; p = next_mnt(p, subtree))
if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
+ return false;
+ return true;
}
/**
@@ -2375,7 +2372,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- guard(rwsem_read)(&namespace_sem);
+ guard(namespace_shared)();
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
@@ -2496,8 +2493,7 @@ enum mnt_tree_flags_t {
/**
* attach_recursive_mnt - attach a source mount tree
* @source_mnt: mount tree to be attached
- * @dest_mnt: mount that @source_mnt will be mounted on
- * @dest_mp: the mountpoint @source_mnt will be mounted at
+ * @dest: the context for mounting at the place where the tree should go
*
* NOTE: the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
@@ -2560,10 +2556,11 @@ enum mnt_tree_flags_t {
* Otherwise a negative error code is returned.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct mount *dest_mnt,
- struct mountpoint *dest_mp)
+ const struct pinned_mountpoint *dest)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct mount *dest_mnt = dest->parent;
+ struct mountpoint *dest_mp = dest->mp;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct pinned_mountpoint root = {};
@@ -2643,10 +2640,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt_mountpoint);
commit_tree(child);
if (q) {
+ struct mount *r = topmost_overmount(child);
struct mountpoint *mp = root.mp;
- struct mount *r = child;
- while (unlikely(r->overmount))
- r = r->overmount;
+
if (unlikely(shorter) && child != source_mnt)
mp = shorter;
mnt_change_mountpoint(r, mp, q);
@@ -2675,110 +2671,120 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return err;
}
+static inline struct mount *where_to_mount(const struct path *path,
+ struct dentry **dentry,
+ bool beneath)
+{
+ struct mount *m;
+
+ if (unlikely(beneath)) {
+ m = topmost_overmount(real_mount(path->mnt));
+ *dentry = m->mnt_mountpoint;
+ return m->mnt_parent;
+ }
+ m = __lookup_mnt(path->mnt, path->dentry);
+ if (unlikely(m)) {
+ m = topmost_overmount(m);
+ *dentry = m->mnt.mnt_root;
+ return m;
+ }
+ *dentry = path->dentry;
+ return real_mount(path->mnt);
+}
+
/**
- * do_lock_mount - lock mount and mountpoint
- * @path: target path
- * @beneath: whether the intention is to mount beneath @path
- *
- * Follow the mount stack on @path until the top mount @mnt is found. If
- * the initial @path->{mnt,dentry} is a mountpoint lookup the first
- * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
- * until nothing is stacked on top of it anymore.
+ * do_lock_mount - acquire environment for mounting
+ * @path: target path
+ * @res: context to set up
+ * @beneath: whether the intention is to mount beneath @path
*
- * Acquire the inode_lock() on the top mount's ->mnt_root to protect
- * against concurrent removal of the new mountpoint from another mount
- * namespace.
+ * To mount something at a given location, we need
+ * namespace_sem locked exclusive
+ * inode of dentry we are mounting on locked exclusive
+ * struct mountpoint for that dentry
+ * struct mount we are mounting on
*
- * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint
- * @mp on @mnt->mnt_parent must be acquired. This protects against a
- * concurrent unlink of @mp->mnt_dentry from another mount namespace
- * where @mnt doesn't have a child mount mounted @mp. A concurrent
- * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
- * on top of it for @beneath.
+ * Results are stored in caller-supplied context (pinned_mountpoint);
+ * on success we have res->parent and res->mp pointing to parent and
+ * mountpoint respectively and res->node inserted into the ->m_list
+ * of the mountpoint, making sure the mountpoint won't disappear.
+ * On failure we have res->parent set to ERR_PTR(-E...), res->mp
+ * left NULL and res->node empty.
+ * On success do_lock_mount() returns with the locks acquired (in
+ * proper order - the inode lock nests outside of namespace_sem).
*
- * In addition, @beneath needs to make sure that @mnt hasn't been
- * unmounted or moved from its current mountpoint in between dropping
- * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
- * being unmounted would be detected later by e.g., calling
- * check_mnt(mnt) in the function it's called from. For the @beneath
- * case however, it's useful to detect it directly in do_lock_mount().
- * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
- * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
- * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
+ * A request to mount on an overmounted location is treated as "mount
+ * on top of whatever's overmounting it"; a request to mount beneath
+ * a location as "mount immediately beneath the topmost mount at that
+ * place".
*
- * Return: Either the target mountpoint on the top mount or the top
- * mount's mountpoint.
+ * In all cases the location must not have been unmounted and the
+ * chosen mountpoint must be allowed to be mounted on. For the "beneath"
+ * case we also require the location to be at the root of a mount
+ * that has a parent (i.e. is not a root of some namespace).
*/
-static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
+static void do_lock_mount(const struct path *path,
+ struct pinned_mountpoint *res,
+ bool beneath)
{
- struct vfsmount *mnt = path->mnt;
- struct dentry *dentry;
- struct path under = {};
- int err = -ENOENT;
+ int err;
- for (;;) {
- struct mount *m = real_mount(mnt);
+ if (unlikely(beneath) && !path_mounted(path)) {
+ res->parent = ERR_PTR(-EINVAL);
+ return;
+ }
- if (beneath) {
- path_put(&under);
- read_seqlock_excl(&mount_lock);
- under.mnt = mntget(&m->mnt_parent->mnt);
- under.dentry = dget(m->mnt_mountpoint);
- read_sequnlock_excl(&mount_lock);
- dentry = under.dentry;
- } else {
- dentry = path->dentry;
+ do {
+ struct dentry *dentry, *d;
+ struct mount *m, *n;
+
+ scoped_guard(mount_locked_reader) {
+ m = where_to_mount(path, &dentry, beneath);
+ if (&m->mnt != path->mnt) {
+ mntget(&m->mnt);
+ dget(dentry);
+ }
}
inode_lock(dentry->d_inode);
namespace_lock();
- if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
- break; // not to be mounted on
+ // check if the chain of mounts (if any) has changed.
+ scoped_guard(mount_locked_reader)
+ n = where_to_mount(path, &d, beneath);
- if (beneath && unlikely(m->mnt_mountpoint != dentry ||
- &m->mnt_parent->mnt != under.mnt)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- continue; // got moved
- }
+ if (unlikely(n != m || dentry != d))
+ err = -EAGAIN; // something moved, retry
+ else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt)))
+ err = -ENOENT; // not to be mounted on
+ else if (beneath && &m->mnt == path->mnt && !m->overmount)
+ err = -EINVAL;
+ else
+ err = get_mountpoint(dentry, res);
- mnt = lookup_mnt(path);
- if (unlikely(mnt)) {
+ if (unlikely(err)) {
+ res->parent = ERR_PTR(err);
namespace_unlock();
inode_unlock(dentry->d_inode);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- continue; // got overmounted
+ } else {
+ res->parent = m;
}
- err = get_mountpoint(dentry, pinned);
- if (err)
- break;
- if (beneath) {
- /*
- * @under duplicates the references that will stay
- * at least until namespace_unlock(), so the path_put()
- * below is safe (and OK to do under namespace_lock -
- * we are not dropping the final references here).
- */
- path_put(&under);
+ /*
+ * Drop the temporary references. This is subtle - on success
+ * we are doing that under namespace_sem, which would normally
+ * be forbidden. However, in that case we are guaranteed that
+ * refcounts won't reach zero, since we know that path->mnt
+ * is mounted and thus all mounts reachable from it are pinned
+ * and stable, along with their mountpoints and roots.
+ */
+ if (&m->mnt != path->mnt) {
+ dput(dentry);
+ mntput(&m->mnt);
}
- return 0;
- }
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- path_put(&under);
- return err;
-}
-
-static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
-{
- return do_lock_mount(path, m, false);
+ } while (err == -EAGAIN);
}
-static void unlock_mount(struct pinned_mountpoint *m)
+static void __unlock_mount(struct pinned_mountpoint *m)
{
inode_unlock(m->mp->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
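Stripped of the VFS specifics, the restructured do_lock_mount() is an optimistic retry loop: sample the answer under the cheap lock, take the expensive locks, then re-sample and start over if anything moved in between. Every name below is illustrative:

struct answer { void *parent; void *dentry; };

extern struct answer cheap_locked_sample(void);
extern void take_expensive_locks(void);
extern void drop_expensive_locks(void);

static struct answer lock_for_mount_sketch(void)
{
        struct answer a, b;

        for (;;) {
                a = cheap_locked_sample();      /* mount_locked_reader scope */
                take_expensive_locks();         /* inode lock, namespace_sem */
                b = cheap_locked_sample();      /* re-check under the locks */
                if (a.parent == b.parent && a.dentry == b.dentry)
                        return a;               /* nothing moved: done */
                drop_expensive_locks();         /* raced with a move: retry */
        }
}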
@@ -2787,16 +2793,30 @@ static void unlock_mount(struct pinned_mountpoint *m)
namespace_unlock();
}
-static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
+static inline void unlock_mount(struct pinned_mountpoint *m)
+{
+ if (!IS_ERR(m->parent))
+ __unlock_mount(m);
+}
+
+#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ do_lock_mount((path), &mp, (beneath))
+#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
+#define LOCK_MOUNT_EXACT(mp, path) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ lock_mount_exact((path), &mp)
+
+static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
return -EINVAL;
- if (d_is_dir(mp->m_dentry) !=
+ if (d_is_dir(mp->mp->m_dentry) !=
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp);
+ return attach_recursive_mnt(mnt, mp);
}
static int may_change_propagation(const struct mount *m)
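With the __cleanup-based LOCK_MOUNT() macros, unlock_mount() runs automatically when mp leaves scope, so callers may return from any branch. A hypothetical caller shape, mirroring the do_loopback() conversion below (not itself part of the patch):

static int mount_tree_at(struct mount *tree, const struct path *where)
{
        LOCK_MOUNT(mp, where);                  /* declares mp, takes locks */
        if (IS_ERR(mp.parent))
                return PTR_ERR(mp.parent);      /* nothing to unlock */

        if (!check_mnt(mp.parent))
                return -EINVAL;                 /* unlock_mount(&mp) runs */

        return graft_tree(tree, &mp);           /* ... and here as well */
}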
@@ -2832,13 +2852,13 @@ static int flags_to_propagation_type(int ms_flags)
/*
* recursively change the type of the mountpoint.
*/
-static int do_change_type(struct path *path, int ms_flags)
+static int do_change_type(const struct path *path, int ms_flags)
{
struct mount *m;
struct mount *mnt = real_mount(path->mnt);
int recurse = ms_flags & MS_REC;
int type;
- int err = 0;
+ int err;
if (!path_mounted(path))
return -EINVAL;
@@ -2847,23 +2867,22 @@ static int do_change_type(struct path *path, int ms_flags)
if (!type)
return -EINVAL;
- namespace_lock();
+ guard(namespace_excl)();
+
err = may_change_propagation(mnt);
if (err)
- goto out_unlock;
+ return err;
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
- goto out_unlock;
+ return err;
}
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- out_unlock:
- namespace_unlock();
- return err;
+ return 0;
}
/* may_copy_tree() - check if a mount tree can be copied
@@ -2909,7 +2928,7 @@ static int do_change_type(struct path *path, int ms_flags)
*
* Returns true if the mount tree can be copied, false otherwise.
*/
-static inline bool may_copy_tree(struct path *path)
+static inline bool may_copy_tree(const struct path *path)
{
struct mount *mnt = real_mount(path->mnt);
const struct dentry_operations *d_op;
@@ -2931,7 +2950,7 @@ static inline bool may_copy_tree(struct path *path)
}
-static struct mount *__do_loopback(struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path, int recurse)
{
struct mount *old = real_mount(old_path->mnt);
@@ -2953,12 +2972,11 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
/*
* do loopback mount.
*/
-static int do_loopback(struct path *path, const char *old_name,
- int recurse)
+static int do_loopback(const struct path *path, const char *old_name,
+ int recurse)
{
- struct path old_path;
- struct mount *mnt = NULL, *parent;
- struct pinned_mountpoint mp = {};
+ struct path old_path __free(path_put) = {};
+ struct mount *mnt = NULL;
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -2966,49 +2984,40 @@ static int do_loopback(struct path *path, const char *old_name,
if (err)
return err;
- err = -EINVAL;
if (mnt_ns_loop(old_path.dentry))
- goto out;
+ return -EINVAL;
- err = lock_mount(path, &mp);
- if (err)
- goto out;
+ LOCK_MOUNT(mp, path);
+ if (IS_ERR(mp.parent))
+ return PTR_ERR(mp.parent);
- parent = real_mount(path->mnt);
- if (!check_mnt(parent))
- goto out2;
+ if (!check_mnt(mp.parent))
+ return -EINVAL;
mnt = __do_loopback(&old_path, recurse);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- goto out2;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
- err = graft_tree(mnt, parent, mp.mp);
+ err = graft_tree(mnt, &mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
-out2:
- unlock_mount(&mp);
-out:
- path_put(&old_path);
return err;
}
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
{
struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
- struct file *file;
ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
- return ERR_CAST(ns);
+ return ns;
- namespace_lock();
+ guard(namespace_excl)();
/*
* Record the sequence number of the source mount namespace.
@@ -3025,23 +3034,28 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
- namespace_unlock();
- free_mnt_ns(ns);
+ emptied_ns = ns;
return ERR_CAST(mnt);
}
- lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
mnt_add_to_ns(ns, p);
ns->nr_mounts++;
}
ns->root = mnt;
- mntget(&mnt->mnt);
- unlock_mount_hash();
- namespace_unlock();
+ return ns;
+}
+
+static struct file *open_detached_copy(struct path *path, bool recursive)
+{
+ struct mnt_namespace *ns = get_detached_copy(path, recursive);
+ struct file *file;
+
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
mntput(path->mnt);
- path->mnt = &mnt->mnt;
+ path->mnt = mntget(&ns->root->mnt);
file = dentry_open(path, O_PATH, current_cred());
if (IS_ERR(file))
dissolve_on_fput(path->mnt);
@@ -3158,7 +3172,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
touch_mnt_namespace(mnt->mnt_ns);
}
-static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
+static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
+ struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;
@@ -3192,7 +3207,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
* superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
* to mount(2).
*/
-static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
+static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags)
{
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
@@ -3229,7 +3244,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
-static int do_remount(struct path *path, int sb_flags,
+static int do_remount(const struct path *path, int sb_flags,
int mnt_flags, void *data)
{
int err;
@@ -3287,49 +3302,46 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-static int do_set_group(struct path *from_path, struct path *to_path)
+static int do_set_group(const struct path *from_path, const struct path *to_path)
{
- struct mount *from, *to;
+ struct mount *from = real_mount(from_path->mnt);
+ struct mount *to = real_mount(to_path->mnt);
int err;
- from = real_mount(from_path->mnt);
- to = real_mount(to_path->mnt);
-
- namespace_lock();
+ guard(namespace_excl)();
err = may_change_propagation(from);
if (err)
- goto out;
+ return err;
err = may_change_propagation(to);
if (err)
- goto out;
+ return err;
- err = -EINVAL;
/* To and From paths should be mount roots */
if (!path_mounted(from_path))
- goto out;
+ return -EINVAL;
if (!path_mounted(to_path))
- goto out;
+ return -EINVAL;
/* Setting sharing groups is only allowed across same superblock */
if (from->mnt.mnt_sb != to->mnt.mnt_sb)
- goto out;
+ return -EINVAL;
/* From mount root should be wider than To mount root */
if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
- goto out;
+ return -EINVAL;
/* From mount should not have locked children in place of To's root */
if (__has_locked_children(from, to->mnt.mnt_root))
- goto out;
+ return -EINVAL;
/* Setting sharing groups is only allowed on private mounts */
if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
- goto out;
+ return -EINVAL;
/* From should not be private */
if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
- goto out;
+ return -EINVAL;
if (IS_MNT_SLAVE(from)) {
hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
@@ -3341,11 +3353,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
list_add(&to->mnt_share, &from->mnt_share);
set_mnt_shared(to);
}
-
- err = 0;
-out:
- namespace_unlock();
- return err;
+ return 0;
}
/**
@@ -3389,17 +3397,15 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
/**
* can_move_mount_beneath - check that we can mount beneath the top mount
- * @from: mount to mount beneath
- * @to: mount under which to mount
- * @mp: mountpoint of @to
+ * @mnt_from: mount we are trying to move
+ * @mnt_to: mount under which to mount
+ * @mp: mountpoint of @mnt_to
*
- * - Make sure that @to->dentry is actually the root of a mount under
- * which we can mount another mount.
* - Make sure that nothing can be mounted beneath the caller's current
* root or the rootfs of the namespace.
* - Make sure that the caller can unmount the topmost mount ensuring
* that the caller could reveal the underlying mountpoint.
- * - Ensure that nothing has been mounted on top of @from before we
+ * - Ensure that nothing has been mounted on top of @mnt_from before we
* grabbed @namespace_sem to avoid creating pointless shadow mounts.
* - Prevent mounting beneath a mount if the propagation relationship
* between the source mount, parent mount, and top mount would lead to
@@ -3408,25 +3414,17 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
* Context: This function expects namespace_lock() to be held.
* Return: On success 0, and on error a negative error code is returned.
*/
-static int can_move_mount_beneath(const struct path *from,
- const struct path *to,
+static int can_move_mount_beneath(const struct mount *mnt_from,
+ const struct mount *mnt_to,
const struct mountpoint *mp)
{
- struct mount *mnt_from = real_mount(from->mnt),
- *mnt_to = real_mount(to->mnt),
- *parent_mnt_to = mnt_to->mnt_parent;
-
- if (!mnt_has_parent(mnt_to))
- return -EINVAL;
-
- if (!path_mounted(to))
- return -EINVAL;
+ struct mount *parent_mnt_to = mnt_to->mnt_parent;
if (IS_MNT_LOCKED(mnt_to))
return -EINVAL;
/* Avoid creating shadow mounts during mount propagation. */
- if (path_overmounted(from))
+ if (mnt_from->overmount)
return -EINVAL;
/*
@@ -3517,97 +3515,83 @@ static inline bool may_use_mount(struct mount *mnt)
return check_anonymous_mnt(mnt);
}
-static int do_move_mount(struct path *old_path,
- struct path *new_path, enum mnt_tree_flags_t flags)
+static int do_move_mount(const struct path *old_path,
+ const struct path *new_path,
+ enum mnt_tree_flags_t flags)
{
- struct mnt_namespace *ns;
- struct mount *p;
- struct mount *old;
- struct mount *parent;
- struct pinned_mountpoint mp;
+ struct mount *old = real_mount(old_path->mnt);
int err;
bool beneath = flags & MNT_TREE_BENEATH;
- err = do_lock_mount(new_path, &mp, beneath);
- if (err)
- return err;
+ if (!path_mounted(old_path))
+ return -EINVAL;
- old = real_mount(old_path->mnt);
- p = real_mount(new_path->mnt);
- parent = old->mnt_parent;
- ns = old->mnt_ns;
+ if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry))
+ return -EINVAL;
- err = -EINVAL;
+ LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
+ if (IS_ERR(mp.parent))
+ return PTR_ERR(mp.parent);
if (check_mnt(old)) {
/* if the source is in our namespace... */
/* ... it should be detachable from parent */
if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
- goto out;
+ return -EINVAL;
+ /* ... which should not be shared */
+ if (IS_MNT_SHARED(old->mnt_parent))
+ return -EINVAL;
/* ... and the target should be in our namespace */
- if (!check_mnt(p))
- goto out;
- /* parent of the source should not be shared */
- if (IS_MNT_SHARED(parent))
- goto out;
+ if (!check_mnt(mp.parent))
+ return -EINVAL;
} else {
/*
* otherwise the source must be the root of some anon namespace.
*/
if (!anon_ns_root(old))
- goto out;
+ return -EINVAL;
/*
* Bail out early if the target is within the same namespace -
* subsequent checks would've rejected that, but they lose
* some corner cases if we check it early.
*/
- if (ns == p->mnt_ns)
- goto out;
+ if (old->mnt_ns == mp.parent->mnt_ns)
+ return -EINVAL;
/*
* Target should be either in our namespace or in an acceptable
* anon namespace, sensu check_anonymous_mnt().
*/
- if (!may_use_mount(p))
- goto out;
+ if (!may_use_mount(mp.parent))
+ return -EINVAL;
}
- if (!path_mounted(old_path))
- goto out;
-
- if (d_is_dir(new_path->dentry) !=
- d_is_dir(old_path->dentry))
- goto out;
-
if (beneath) {
- err = can_move_mount_beneath(old_path, new_path, mp.mp);
- if (err)
- goto out;
+ struct mount *over = real_mount(new_path->mnt);
- err = -EINVAL;
- p = p->mnt_parent;
+ if (mp.parent != over->mnt_parent)
+ over = mp.parent->overmount;
+ err = can_move_mount_beneath(old, over, mp.mp);
+ if (err)
+ return err;
}
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
*/
- if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
- goto out;
- err = -ELOOP;
+ if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old))
+ return -EINVAL;
if (!check_for_nsfs_mounts(old))
- goto out;
- if (mount_is_ancestor(old, p))
- goto out;
+ return -ELOOP;
+ if (mount_is_ancestor(old, mp.parent))
+ return -ELOOP;
- err = attach_recursive_mnt(old, p, mp.mp);
-out:
- unlock_mount(&mp);
- return err;
+ return attach_recursive_mnt(old, &mp);
}
-static int do_move_mount_old(struct path *path, const char *old_name)
+static int do_move_mount_old(const struct path *path, const char *old_name)
{
- struct path old_path;
+ struct path old_path __free(path_put) = {};
int err;
if (!old_name || !*old_name)
@@ -3617,18 +3601,19 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, 0);
- path_put(&old_path);
- return err;
+ return do_move_mount(&old_path, path, 0);
}
/*
* add a mount into a namespace's mount tree
*/
-static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
- const struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp,
+ int mnt_flags)
{
- struct mount *parent = real_mount(path->mnt);
+ struct mount *parent = mp->parent;
+
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
mnt_flags &= ~MNT_INTERNAL_FLAGS;
@@ -3642,14 +3627,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
}
/* Refuse the same filesystem on the same mount point */
- if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
+ if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb &&
+ parent->mnt.mnt_root == mp->mp->m_dentry)
return -EBUSY;
if (d_is_symlink(newmnt->mnt.mnt_root))
return -EINVAL;
newmnt->mnt.mnt_flags = mnt_flags;
- return graft_tree(newmnt, parent, mp);
+ return graft_tree(newmnt, mp);
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
@@ -3658,41 +3644,32 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
* Create a new mount using a superblock configuration and request it
* be added to the namespace tree.
*/
-static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint,
unsigned int mnt_flags)
{
- struct vfsmount *mnt;
- struct pinned_mountpoint mp = {};
- struct super_block *sb = fc->root->d_sb;
+ struct super_block *sb;
+ struct vfsmount *mnt __free(mntput) = fc_mount(fc);
int error;
- error = security_sb_kern_mount(sb);
- if (!error && mount_too_revealing(sb, &mnt_flags)) {
- errorfcp(fc, "VFS", "Mount too revealing");
- error = -EPERM;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
- if (unlikely(error)) {
- fc_drop_locked(fc);
+ sb = fc->root->d_sb;
+ error = security_sb_kern_mount(sb);
+ if (unlikely(error))
return error;
- }
- up_write(&sb->s_umount);
-
- mnt = vfs_create_mount(fc);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
+ if (unlikely(mount_too_revealing(sb, &mnt_flags))) {
+ errorfcp(fc, "VFS", "Mount too revealing");
+ return -EPERM;
+ }
mnt_warn_timestamp_expiry(mountpoint, mnt);
- error = lock_mount(mountpoint, &mp);
- if (!error) {
- error = do_add_mount(real_mount(mnt), mp.mp,
- mountpoint, mnt_flags);
- unlock_mount(&mp);
- }
- if (error < 0)
- mntput(mnt);
+ LOCK_MOUNT(mp, mountpoint);
+ error = do_add_mount(real_mount(mnt), &mp, mnt_flags);
+ if (!error)
+ retain_and_null_ptr(mnt); // consumed on success
return error;
}
@@ -3700,8 +3677,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
-static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
- int mnt_flags, const char *name, void *data)
+static int do_new_mount(const struct path *path, const char *fstype,
+ int sb_flags, int mnt_flags,
+ const char *name, void *data)
{
struct file_system_type *type;
struct fs_context *fc;
@@ -3738,27 +3716,46 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
fc->oldapi = true;
if (subtype)
- err = vfs_parse_fs_string(fc, "subtype",
- subtype, strlen(subtype));
+ err = vfs_parse_fs_string(fc, "subtype", subtype);
if (!err && name)
- err = vfs_parse_fs_string(fc, "source", name, strlen(name));
+ err = vfs_parse_fs_string(fc, "source", name);
if (!err)
err = parse_monolithic_mount_data(fc, data);
if (!err && !mount_capable(fc))
err = -EPERM;
if (!err)
- err = vfs_get_tree(fc);
- if (!err)
err = do_new_mount_fc(fc, path, mnt_flags);
put_fs_context(fc);
return err;
}
-int finish_automount(struct vfsmount *m, const struct path *path)
+static void lock_mount_exact(const struct path *path,
+ struct pinned_mountpoint *mp)
{
struct dentry *dentry = path->dentry;
- struct pinned_mountpoint mp = {};
+ int err;
+
+ inode_lock(dentry->d_inode);
+ namespace_lock();
+ if (unlikely(cant_mount(dentry)))
+ err = -ENOENT;
+ else if (path_overmounted(path))
+ err = -EBUSY;
+ else
+ err = get_mountpoint(dentry, mp);
+ if (unlikely(err)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ mp->parent = ERR_PTR(err);
+ } else {
+ mp->parent = real_mount(path->mnt);
+ }
+}
+
+int finish_automount(struct vfsmount *__m, const struct path *path)
+{
+ struct vfsmount *m __free(mntput) = __m;
struct mount *mnt;
int err;
@@ -3769,43 +3766,21 @@ int finish_automount(struct vfsmount *m, const struct path *path)
mnt = real_mount(m);
- if (m->mnt_sb == path->mnt->mnt_sb &&
- m->mnt_root == dentry) {
- err = -ELOOP;
- goto discard;
- }
+ if (m->mnt_root == path->dentry)
+ return -ELOOP;
/*
- * we don't want to use lock_mount() - in this case finding something
+ * we don't want to use LOCK_MOUNT() - in this case finding something
* that overmounts our mountpoint to be means "quitely drop what we've
* got", not "try to mount it on top".
*/
- inode_lock(dentry->d_inode);
- namespace_lock();
- if (unlikely(cant_mount(dentry))) {
- err = -ENOENT;
- goto discard_locked;
- }
- if (path_overmounted(path)) {
- err = 0;
- goto discard_locked;
- }
- err = get_mountpoint(dentry, &mp);
- if (err)
- goto discard_locked;
-
- err = do_add_mount(mnt, mp.mp, path,
- path->mnt->mnt_flags | MNT_SHRINKABLE);
- unlock_mount(&mp);
- if (unlikely(err))
- goto discard;
- return 0;
+ LOCK_MOUNT_EXACT(mp, path);
+ if (mp.parent == ERR_PTR(-EBUSY))
+ return 0;
-discard_locked:
- namespace_unlock();
- inode_unlock(dentry->d_inode);
-discard:
- mntput(m);
+ err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ if (likely(!err))
+ retain_and_null_ptr(m);
return err;
}
@@ -3816,9 +3791,8 @@ discard:
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
- read_sequnlock_excl(&mount_lock);
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -3835,8 +3809,8 @@ void mark_mounts_for_expiry(struct list_head *mounts)
if (list_empty(mounts))
return;
- namespace_lock();
- lock_mount_hash();
+ guard(namespace_excl)();
+ guard(mount_writer)();
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -3858,8 +3832,6 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
- unlock_mount_hash();
- namespace_unlock();
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -3987,7 +3959,7 @@ static char *copy_mount_string(const void __user *data)
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page)
{
unsigned int mnt_flags = 0, sb_flags;
@@ -4069,15 +4041,13 @@ int path_mount(const char *dev_name, struct path *path,
int do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
if (ret)
return ret;
- ret = path_mount(dev_name, &path, type_page, flags, data_page);
- path_put(&path);
- return ret;
+ return path_mount(dev_name, &path, type_page, flags, data_page);
}
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
@@ -4138,7 +4108,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
- struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
+ struct vfsmount *rootmnt __free(mntput) = NULL;
+ struct vfsmount *pwdmnt __free(mntput) = NULL;
struct mount *p, *q;
struct mount *old;
struct mount *new;
@@ -4157,23 +4128,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
if (IS_ERR(new_ns))
return new_ns;
- namespace_lock();
+ guard(namespace_excl)();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
- namespace_unlock();
- ns_common_free(ns);
- dec_mnt_namespaces(new_ns->ucounts);
- mnt_ns_release(new_ns);
+ emptied_ns = new_ns;
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
- lock_mount_hash();
+ guard(mount_writer)();
lock_mnt_tree(new);
- unlock_mount_hash();
}
new_ns->root = new;
@@ -4205,13 +4172,6 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
- namespace_unlock();
-
- if (rootmnt)
- mntput(rootmnt);
- if (pwdmnt)
- mntput(pwdmnt);
-
ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -4436,7 +4396,8 @@ err_unlock:
return ret;
}
-static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+static inline int vfs_move_mount(const struct path *from_path,
+ const struct path *to_path,
enum mnt_tree_flags_t mflags)
{
int ret;
@@ -4542,7 +4503,7 @@ SYSCALL_DEFINE5(move_mount,
/*
* Return true if path is reachable from root
*
- * namespace_sem or mount_lock is held
+ * locks: mount_locked_reader || namespace_shared && is_mounted(mnt)
*/
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
const struct path *root)
@@ -4556,11 +4517,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
bool path_is_under(const struct path *path1, const struct path *path2)
{
- bool res;
- read_seqlock_excl(&mount_lock);
- res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
- read_sequnlock_excl(&mount_lock);
- return res;
+ guard(mount_locked_reader)();
+ return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
}
EXPORT_SYMBOL(path_is_under);
@@ -4592,9 +4550,10 @@ EXPORT_SYMBOL(path_is_under);
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
- struct path new, old, root;
+ struct path new __free(path_put) = {};
+ struct path old __free(path_put) = {};
+ struct path root __free(path_put) = {};
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
- struct pinned_mountpoint old_mp = {};
int error;
if (!may_mount())
@@ -4603,57 +4562,54 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = user_path_at(AT_FDCWD, new_root,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
- goto out0;
+ return error;
error = user_path_at(AT_FDCWD, put_old,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
if (error)
- goto out1;
+ return error;
error = security_sb_pivotroot(&old, &new);
if (error)
- goto out2;
+ return error;
get_fs_root(current->fs, &root);
- error = lock_mount(&old, &old_mp);
- if (error)
- goto out3;
- error = -EINVAL;
+ LOCK_MOUNT(old_mp, &old);
+ old_mnt = old_mp.parent;
+ if (IS_ERR(old_mnt))
+ return PTR_ERR(old_mnt);
+
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
- old_mnt = real_mount(old.mnt);
ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
if (IS_MNT_SHARED(old_mnt) ||
IS_MNT_SHARED(ex_parent) ||
IS_MNT_SHARED(root_parent))
- goto out4;
+ return -EINVAL;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
- goto out4;
+ return -EINVAL;
if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
- goto out4;
- error = -ENOENT;
+ return -EINVAL;
if (d_unlinked(new.dentry))
- goto out4;
- error = -EBUSY;
+ return -ENOENT;
if (new_mnt == root_mnt || old_mnt == root_mnt)
- goto out4; /* loop, on the same file system */
- error = -EINVAL;
+ return -EBUSY; /* loop, on the same file system */
if (!path_mounted(&root))
- goto out4; /* not a mountpoint */
+ return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
- goto out4; /* absolute root */
+ return -EINVAL; /* absolute root */
if (!path_mounted(&new))
- goto out4; /* not a mountpoint */
+ return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
- goto out4; /* absolute root */
+ return -EINVAL; /* absolute root */
/* make sure we can reach put_old from new_root */
- if (!is_path_reachable(old_mnt, old.dentry, &new))
- goto out4;
+ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+ return -EINVAL;
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
- goto out4;
+ return -EINVAL;
lock_mount_hash();
umount_mnt(new_mnt);
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -4672,17 +4628,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
mnt_notify_add(root_mnt);
mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
- error = 0;
-out4:
- unlock_mount(&old_mp);
-out3:
- path_put(&root);
-out2:
- path_put(&old);
-out1:
- path_put(&new);
-out0:
- return error;
+ return 0;
}
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
@@ -4772,8 +4718,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
if (!mnt_allow_writers(kattr, m)) {
err = mnt_hold_writers(m);
- if (err)
+ if (err) {
+ m = next_mnt(m, mnt);
break;
+ }
}
if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
@@ -4781,25 +4729,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
}
if (err) {
- struct mount *p;
-
- /*
- * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
- * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
- * mounts and needs to take care to include the first mount.
- */
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- /* If we had to hold writers unblock them. */
- if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt_unhold_writers(p);
-
- /*
- * We're done once the first mount we changed got
- * MNT_WRITE_HOLD unset.
- */
- if (p == m)
- break;
- }
+ /* undo all mnt_hold_writers() we'd done */
+ for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt))
+ mnt_unhold_writers(p);
}
return err;
}
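The simplified unwind leans on two details visible elsewhere in this patch: the failure path advances m one past the failing mount, and mnt_unhold_writers() now bails out early on mounts that were never held, which makes a blanket sweep over the half-open range [mnt, m) safe:

/* 'end' is one past the mount whose mnt_hold_writers() failed */
static void undo_holds_sketch(struct mount *first, struct mount *end,
                              struct mount *top)
{
        for (struct mount *p = first; p != end; p = next_mnt(p, top))
                mnt_unhold_writers(p);  /* no-op if never held */
}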
@@ -4830,8 +4762,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
WRITE_ONCE(m->mnt.mnt_flags, flags);
/* If we had to hold writers unblock them. */
- if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt_unhold_writers(m);
+ mnt_unhold_writers(m);
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
@@ -4841,7 +4772,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
touch_mnt_namespace(mnt->mnt_ns);
}
-static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr)
{
struct mount *mnt = real_mount(path->mnt);
int err = 0;
@@ -5639,6 +5570,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
STATMOUNT_MNT_UIDMAP | \
STATMOUNT_MNT_GIDMAP)
+/* locks: namespace_shared */
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5885,7 +5817,7 @@ retry:
if (ret)
return ret;
- scoped_guard(rwsem_read, &namespace_sem)
+ scoped_guard(namespace_shared)
ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
if (!ret)
@@ -5906,6 +5838,7 @@ struct klistmount {
struct path root;
};
+/* locks: namespace_shared */
static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
struct mnt_namespace *ns = kls->ns;
@@ -6040,7 +5973,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* We only need to guard against mount topology changes as
* listmount() doesn't care about any mount properties.
*/
- scoped_guard(rwsem_read, &namespace_sem)
+ scoped_guard(namespace_shared)
ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
@@ -6127,12 +6060,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
{
if (!ns_ref_put(ns))
return;
- namespace_lock();
+ guard(namespace_excl)();
emptied_ns = ns;
- lock_mount_hash();
+ guard(mount_writer)();
umount_tree(ns->root, 0);
- unlock_mount_hash();
- namespace_unlock();
}
struct vfsmount *kern_mount(struct file_system_type *type)
@@ -6181,25 +6112,18 @@ bool our_mnt(struct vfsmount *mnt)
bool current_chrooted(void)
{
/* Does the current process have a non-standard root */
- struct path ns_root;
- struct path fs_root;
- bool chrooted;
-
- /* Find the namespace root */
- ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
- ns_root.dentry = ns_root.mnt->mnt_root;
- path_get(&ns_root);
- while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
- ;
+ struct path fs_root __free(path_put) = {};
+ struct mount *root;
get_fs_root(current->fs, &fs_root);
- chrooted = !path_equal(&fs_root, &ns_root);
+ /* Find the namespace root */
+
+ guard(mount_locked_reader)();
- path_put(&fs_root);
- path_put(&ns_root);
+ root = topmost_overmount(current->nsproxy->mnt_ns->root);
- return chrooted;
+ return fs_root.mnt != &root->mnt || !path_mounted(&fs_root);
}
static bool mnt_already_visible(struct mnt_namespace *ns,
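
The new current_chrooted() compares the process root against the topmost overmount of the namespace root instead of hand-rolling a follow_down_one() loop over struct path. A toy model of that walk, assuming topmost_overmount() simply chases the overmount chain (the struct shape here is illustrative; the !path_mounted() term in the real code additionally catches a root dentry sitting below the mount root):

#include <stdio.h>
#include <stddef.h>

struct mnt {
	struct mnt *overmount;	/* mount stacked directly on top of this one */
};

/* assumed shape of topmost_overmount(): chase the overmount chain up */
static struct mnt *topmost(struct mnt *m)
{
	while (m->overmount)
		m = m->overmount;
	return m;
}

int main(void)
{
	struct mnt ns_root = { NULL }, over = { NULL };
	ns_root.overmount = &over;		/* something is mounted over "/" */

	struct mnt *fs_root_mnt = &over;	/* process root sits on the overmount */
	int chrooted = fs_root_mnt != topmost(&ns_root);

	printf("chrooted=%d\n", chrooted);	/* 0: the topmost overmount still counts as "/" */
	return 0;
}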
@@ -6208,9 +6132,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
{
int new_flags = *new_mnt_flags;
struct mount *mnt, *n;
- bool visible = false;
- down_read(&namespace_sem);
+ guard(namespace_shared)();
rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
struct mount *child;
int mnt_flags;
@@ -6257,13 +6180,10 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
MNT_LOCK_ATIME);
- visible = true;
- goto found;
+ return true;
next: ;
}
-found:
- up_read(&namespace_sem);
- return visible;
+ return false;
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
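
The namespace.c conversions above lean on the kernel's cleanup helpers (include/linux/cleanup.h): guard(), scoped_guard() and __free() are thin wrappers around the compiler's cleanup attribute, with the namespace_shared/namespace_excl/mount_writer guard classes presumably defined earlier in this series. A compilable userspace sketch of the underlying mechanism, which is why the goto/unlock ladders could be deleted:

#include <stdio.h>
#include <pthread.h>

static pthread_rwlock_t namespace_sem = PTHREAD_RWLOCK_INITIALIZER;

/* cleanup handler: runs automatically when the guarded scope is left */
static void shared_unlock(pthread_rwlock_t **l)
{
	pthread_rwlock_unlock(*l);
	printf("released\n");
}

#define guard_namespace_shared() \
	pthread_rwlock_rdlock(&namespace_sem); \
	pthread_rwlock_t *__guard __attribute__((cleanup(shared_unlock))) = &namespace_sem

static int do_statmount_model(void)
{
	guard_namespace_shared();
	printf("holding namespace_sem shared\n");
	return 0;	/* early returns no longer need an unlock label */
}

int main(void)
{
	return do_statmount_model();
}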
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5d6edafbed20..0e4c67373e4f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
struct pnfs_layout_segment *lseg;
struct xdr_buf buf;
struct xdr_stream xdr;
- struct page *scratch;
+ struct folio *scratch;
int status, i;
uint32_t count;
__be32 *p;
@@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
return ERR_PTR(-ENOMEM);
status = -ENOMEM;
- scratch = alloc_page(gfp_mask);
+ scratch = folio_alloc(gfp_mask, 0);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf,
lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_page(&xdr, scratch);
+ xdr_set_scratch_folio(&xdr, scratch);
status = -EIO;
p = xdr_inline_decode(&xdr, 4);
@@ -744,7 +744,7 @@ process_extents:
}
out_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out:
dprintk("%s returns %d\n", __func__, status);
switch (status) {
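
The blocklayout change is a mechanical page-to-folio swap: alloc_page()/__free_page() become folio_alloc(..., 0)/folio_put(), and the stream hook becomes xdr_set_scratch_folio(); what the scratch memory is for does not change. A userspace sketch of that purpose, assembling an XDR item that straddles two buffer fragments into contiguous scratch space (simplified; the kernel tracks this inside struct xdr_stream):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	uint8_t page_a[4] = { 0x00, 0x00, 0x00, 0x2a };	/* item tail starts here... */
	uint8_t page_b[4] = { 0x99, 0x00, 0x00, 0x00 };	/* ...and spills into here */
	uint8_t scratch[4];

	/* a 4-byte XDR word with 3 bytes in page_a's tail and 1 in page_b */
	memcpy(scratch, page_a + 1, 3);
	memcpy(scratch + 3, page_b, 1);

	uint32_t word = (uint32_t)scratch[0] << 24 | (uint32_t)scratch[1] << 16 |
			(uint32_t)scratch[2] << 8 | scratch[3];
	printf("decoded 0x%x\n", word);	/* 0x2a99, now readable in one piece */
	return 0;
}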
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 44306ac22353..ab76120705e2 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -541,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct pnfs_block_dev *top;
struct xdr_stream xdr;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
int nr_volumes, ret, i;
__be32 *p;
- scratch = alloc_page(gfp_mask);
+ scratch = folio_alloc(gfp_mask, 0);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&xdr, scratch);
+ xdr_set_scratch_folio(&xdr, scratch);
p = xdr_inline_decode(&xdr, sizeof(__be32));
if (!p)
@@ -582,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
out_free_volumes:
kfree(volumes);
out_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out:
return node;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 86bdc7d23fb9..c8b837006bb2 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_xprt_destroy_all(serv, net);
+ svc_xprt_destroy_all(serv, net, false);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
ret = svc_bind(serv, net);
if (ret < 0) {
printk(KERN_WARNING "NFS: bind callback service failed\n");
- goto err_bind;
+ goto err;
}
ret = 0;
@@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
- goto err_socks;
+ goto err;
}
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
-err_bind:
+err:
nn->cb_users[minorversion]--;
dprintk("NFS: Couldn't create callback socket: err = %d; "
"net = %x\n", ret, net->ns.inum);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d81217923936..46d9c65d50f8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -829,17 +829,17 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
struct address_space *mapping = desc->file->f_mapping;
struct folio *new, *folio = *arrays;
struct xdr_stream stream;
- struct page *scratch;
+ struct folio *scratch;
struct xdr_buf buf;
u64 cookie;
int status;
- scratch = alloc_page(GFP_KERNEL);
+ scratch = folio_alloc(GFP_KERNEL, 0);
if (scratch == NULL)
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
do {
status = nfs_readdir_entry_decode(desc, entry, &stream);
@@ -891,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
if (folio != *arrays)
nfs_readdir_folio_unlock_and_put(folio);
- put_page(scratch);
+ folio_put(scratch);
return status;
}
@@ -2198,8 +2198,6 @@ no_open:
else
dput(dentry);
}
- if (IS_ERR(res))
- return PTR_ERR(res);
return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
@@ -2260,7 +2258,7 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned int open_flags,
umode_t mode)
{
-
+ struct dentry *res = NULL;
/* Same as look+open from lookup_open(), but with different O_TRUNC
* handling.
*/
@@ -2275,21 +2273,15 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
if (error)
return error;
return finish_open(file, dentry, NULL);
- } else if (d_in_lookup(dentry)) {
+ }
+ if (d_in_lookup(dentry)) {
/* The only flags nfs_lookup considers are
* LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
* we want those to be zero so the lookup isn't skipped.
*/
- struct dentry *res = nfs_lookup(dir, dentry, 0);
-
- d_lookup_done(dentry);
- if (unlikely(res)) {
- if (IS_ERR(res))
- return PTR_ERR(res);
- return finish_no_open(file, res);
- }
+ res = nfs_lookup(dir, dentry, 0);
}
- return finish_no_open(file, NULL);
+ return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
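
Dropping the IS_ERR(res) checks before finish_no_open() only works if that helper now folds an ERR_PTR dentry into its error return; a userspace model of that assumed contract (a sketch, not the fs/open.c code):

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

/* userspace stand-ins for the kernel's ERR_PTR machinery */
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define PTR_ERR(ptr)	((int)(intptr_t)(ptr))
#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-4095)

struct dentry { const char *name; };

/* assumed contract the simplified callers rely on: an ERR_PTR dentry
 * collapses into its error code; NULL and real dentries return 1 */
static int finish_no_open_model(struct dentry *dentry)
{
	if (IS_ERR((void *)dentry))
		return PTR_ERR((void *)dentry);
	return 1;	/* tell the caller no open was performed */
}

int main(void)
{
	struct dentry d = { "file" };

	printf("%d %d %d\n",
	       finish_no_open_model(&d),			/* 1 */
	       finish_no_open_model(NULL),			/* 1 */
	       finish_no_open_model(ERR_PTR(-ENOENT)));		/* -2 */
	return 0;
}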
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8059ece82468..d020aab40c64 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -161,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t result;
+ trace_nfs_file_read(iocb, to);
+
if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_read(iocb, to, false);
@@ -361,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
if (pnfs_ld_read_whole_page(file_inode(file)))
return true;
+ if (folio_test_dropbehind(folio))
+ return false;
/* Open for reading too? */
if (file->f_mode & FMODE_READ)
return true;
@@ -380,22 +384,23 @@ static int nfs_write_begin(const struct kiocb *iocb,
loff_t pos, unsigned len, struct folio **foliop,
void **fsdata)
{
- fgf_t fgp = FGP_WRITEBEGIN;
struct folio *folio;
struct file *file = iocb->ki_filp;
int once_thru = 0;
int ret;
+ trace_nfs_write_begin(file_inode(file), pos, len);
+
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
- fgp |= fgf_set_order(len);
start:
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out;
+ }
*foliop = folio;
ret = nfs_flush_incompatible(file, folio);
@@ -405,11 +410,14 @@ start:
} else if (!once_thru &&
nfs_want_read_modify_write(file, folio, pos, len)) {
once_thru = 1;
+ folio_clear_dropbehind(folio);
ret = nfs_read_folio(file, folio);
folio_put(folio);
if (!ret)
goto start;
}
+out:
+ trace_nfs_write_begin_done(file_inode(file), pos, len, ret);
return ret;
}
@@ -423,6 +431,7 @@ static int nfs_write_end(const struct kiocb *iocb,
unsigned offset = offset_in_folio(folio, pos);
int status;
+ trace_nfs_write_end(file_inode(file), pos, len);
dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
@@ -451,13 +460,16 @@ static int nfs_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (status < 0)
+ if (status < 0) {
+ trace_nfs_write_end_done(file_inode(file), pos, len, status);
return status;
+ }
NFS_I(mapping->host)->write_io += copied;
if (nfs_ctx_key_to_expire(ctx, mapping->host))
nfs_wb_all(mapping->host);
+ trace_nfs_write_end_done(file_inode(file), pos, len, copied);
return copied;
}
@@ -690,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
errseq_t since;
int error;
+ trace_nfs_file_write(iocb, from);
+
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
@@ -949,5 +963,6 @@ const struct file_operations nfs_file_operations = {
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+ .fop_flags = FOP_DONTCACHE,
};
EXPORT_SYMBOL_GPL(nfs_file_operations);
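
The FOP_DONTCACHE/dropbehind additions skip the read-modify-write pre-read for folios that will be dropped from the page cache after writeback anyway, so reading them in first would be wasted I/O. A simplified decision model (the real nfs_want_read_modify_write() also consults pnfs_ld_read_whole_page() and the folio/write geometry):

#include <stdio.h>
#include <stdbool.h>

struct ctx {
	bool folio_uptodate;
	bool whole_folio_write;
	bool dropbehind;	/* FOP_DONTCACHE / uncached buffered I/O */
	bool open_for_read;
};

/* should write_begin read the folio first to preserve cached data? */
static bool want_read_modify_write(const struct ctx *c)
{
	if (c->folio_uptodate || c->whole_folio_write)
		return false;
	if (c->dropbehind)	/* the new check: folio will be dropped, don't pre-read */
		return false;
	return c->open_for_read;
}

int main(void)
{
	struct ctx cached   = { false, false, false, true };
	struct ctx uncached = { false, false, true,  true };

	printf("cached: %d, dropbehind: %d\n",
	       want_read_modify_write(&cached),		/* 1: pre-read */
	       want_read_modify_write(&uncached));	/* 0: skip it */
	return 0;
}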
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d39a1f58e18d..5c4551117c58 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
{
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
__be32 *p;
uint32_t nfl_util;
int i;
dprintk("%s: set_layout_map Begin\n", __func__);
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
* num_fh (4) */
@@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
fl->fh_array[i]->size);
}
- __free_page(scratch);
+ folio_put(scratch);
return 0;
out_err:
- __free_page(scratch);
+ folio_put(scratch);
return -EIO;
}
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 29d9234d5c08..df79aeb68db4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -73,18 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct nfs4_file_layout_dsaddr *dsaddr = NULL;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
struct net *net = server->nfs_client->cl_net;
/* set up xdr stream */
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
goto out_err;
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* Get the stripe count (number of stripe index) */
p = xdr_inline_decode(&stream, 4);
@@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
}
}
- __free_page(scratch);
+ folio_put(scratch);
return dsaddr;
out_err_drain_dsaddrs:
@@ -204,7 +204,7 @@ out_err_free_deviceid:
out_err_free_stripe_indices:
kfree(stripe_indices);
out_err_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out_err:
dprintk("%s ERROR: returning NULL\n", __func__);
return NULL;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 9edb5f9b0c4e..df01d2876b68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
- struct nfs4_ff_layout_mirror *mirror);
+ struct nfs4_ff_layout_ds_stripe *dss_info);
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
@@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id)
}
static struct nfsd_file *
-ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id,
struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, fmode_t mode)
{
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode);
+ return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode);
#else
return NULL;
#endif
}
-static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
- const struct nfs4_ff_layout_mirror *m2)
+static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1,
+ const struct nfs4_ff_layout_ds_stripe *dss2)
{
int i, j;
- if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ if (dss1->fh_versions_cnt != dss2->fh_versions_cnt)
return false;
- for (i = 0; i < m1->fh_versions_cnt; i++) {
+
+ for (i = 0; i < dss1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; j++) {
- if (nfs_compare_fh(&m1->fh_versions[i],
- &m2->fh_versions[j]) == 0) {
+ for (j = 0; j < dss2->fh_versions_cnt; j++) {
+ if (nfs_compare_fh(&dss1->fh_versions[i],
+ &dss2->fh_versions[j]) == 0) {
found_fh = true;
break;
}
@@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return true;
}
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ u32 dss_id;
+
+ if (m1->dss_count != m2->dss_count)
+ return false;
+
+ for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+ if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id]))
+ return false;
+
+ return true;
+}
+
+static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ u32 dss_id;
+
+ if (m1->dss_count != m2->dss_count)
+ return false;
+
+ for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+ if (memcmp(&m1->dss[dss_id].devid,
+ &m2->dss[dss_id].devid,
+ sizeof(m1->dss[dss_id].devid)) != 0)
+ return false;
+
+ return true;
+}
+
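
nfs4_ff_layout_calc_dss_id() itself is not shown in this diff; judging from its (stripe_unit, dss_count, offset) call sites it presumably maps a file offset to a data-server stripe index, roughly (offset / stripe_unit) % dss_count. A sketch under that assumption:

#include <stdio.h>
#include <stdint.h>

/* assumed shape of nfs4_ff_layout_calc_dss_id(): which stripe within a
 * mirror serves this file offset? */
static uint32_t calc_dss_id(uint64_t stripe_unit, uint32_t dss_count,
			    uint64_t offset)
{
	if (dss_count <= 1 || stripe_unit == 0)
		return 0;	/* unstriped layouts always hit stripe 0 */
	return (uint32_t)((offset / stripe_unit) % dss_count);
}

int main(void)
{
	/* 1 MiB stripe unit across 4 stripes: offsets rotate 0,1,2,3,0,... */
	for (uint64_t off = 0; off < (6ULL << 20); off += 1 << 20)
		printf("offset %llu MiB -> dss_id %u\n",
		       (unsigned long long)(off >> 20),
		       calc_dss_id(1 << 20, 4, off));
	return 0;
}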
static struct nfs4_ff_layout_mirror *
ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
struct nfs4_ff_layout_mirror *mirror)
@@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
spin_lock(&inode->i_lock);
list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
- if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
+ if (!ff_mirror_match_devid(mirror, pos))
continue;
if (!ff_mirror_match_fh(mirror, pos))
continue;
@@ -240,29 +273,37 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
{
struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
mirror = kzalloc(sizeof(*mirror), gfp_flags);
if (mirror != NULL) {
spin_lock_init(&mirror->lock);
refcount_set(&mirror->ref, 1);
INIT_LIST_HEAD(&mirror->mirrors);
- nfs_localio_file_init(&mirror->nfl);
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ nfs_localio_file_init(&mirror->dss[dss_id].nfl);
}
return mirror;
}
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
- const struct cred *cred;
+ const struct cred *cred;
+ u32 dss_id;
ff_layout_remove_mirror(mirror);
- kfree(mirror->fh_versions);
- nfs_close_local_fh(&mirror->nfl);
- cred = rcu_access_pointer(mirror->ro_cred);
- put_cred(cred);
- cred = rcu_access_pointer(mirror->rw_cred);
- put_cred(cred);
- nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ kfree(mirror->dss[dss_id].fh_versions);
+ cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
+ put_cred(cred);
+ cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
+ put_cred(cred);
+ nfs_close_local_fh(&mirror->dss[dss_id].nfl);
+ nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+ }
+
+ kfree(mirror->dss);
kfree(mirror);
}
@@ -366,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
free_me);
}
+static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror)
+{
+ u32 dss_id, sum = 0;
+
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ sum += mirror->dss[dss_id].efficiency;
+
+ return sum;
+}
+
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
int i, j;
for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
for (j = i + 1; j < fls->mirror_array_cnt; j++)
- if (fls->mirror_array[i]->efficiency <
- fls->mirror_array[j]->efficiency)
+ if (ff_mirror_efficiency_sum(fls->mirror_array[i]) <
+ ff_mirror_efficiency_sum(fls->mirror_array[j]))
swap(fls->mirror_array[i],
fls->mirror_array[j]);
}
@@ -388,20 +439,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_ff_layout_segment *fls = NULL;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
u64 stripe_unit;
u32 mirror_array_cnt;
__be32 *p;
int i, rc;
+ struct nfs4_ff_layout_ds_stripe *dss_info;
dprintk("--> %s\n", __func__);
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
return ERR_PTR(-ENOMEM);
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
lgr->layoutp->len);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* stripe unit and mirror_array_cnt */
rc = -EIO;
@@ -427,23 +479,32 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array_cnt = mirror_array_cnt;
fls->stripe_unit = stripe_unit;
+ u32 dss_count = 0;
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
struct cred *kcred;
const struct cred __rcu *cred;
kuid_t uid;
kgid_t gid;
- u32 ds_count, fh_count, id;
- int j;
+ u32 fh_count, id;
+ int j, dss_id;
rc = -EIO;
p = xdr_inline_decode(&stream, 4);
if (!p)
goto out_err_free;
- ds_count = be32_to_cpup(p);
- /* FIXME: allow for striping? */
- if (ds_count != 1)
+ // Ensure all mirrors have the same stripe count.
+ if (dss_count == 0)
+ dss_count = be32_to_cpup(p);
+ else if (dss_count != be32_to_cpup(p))
+ goto out_err_free;
+
+ if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT ||
+ dss_count == 0)
+ goto out_err_free;
+
+ if (dss_count > 1 && stripe_unit == 0)
goto out_err_free;
fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
@@ -452,91 +513,105 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
}
- fls->mirror_array[i]->ds_count = ds_count;
+ fls->mirror_array[i]->dss_count = dss_count;
+ fls->mirror_array[i]->dss =
+ kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
+ gfp_flags);
+ if (fls->mirror_array[i]->dss == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
- /* deviceid */
- rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
- if (rc)
- goto out_err_free;
+ for (dss_id = 0; dss_id < dss_count; dss_id++) {
+ dss_info = &fls->mirror_array[i]->dss[dss_id];
+ dss_info->mirror = fls->mirror_array[i];
- /* efficiency */
- rc = -EIO;
- p = xdr_inline_decode(&stream, 4);
- if (!p)
- goto out_err_free;
- fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+ /* deviceid */
+ rc = decode_deviceid(&stream, &dss_info->devid);
+ if (rc)
+ goto out_err_free;
- /* stateid */
- rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
- if (rc)
- goto out_err_free;
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ dss_info->efficiency = be32_to_cpup(p);
- /* fh */
- rc = -EIO;
- p = xdr_inline_decode(&stream, 4);
- if (!p)
- goto out_err_free;
- fh_count = be32_to_cpup(p);
+ /* stateid */
+ rc = decode_pnfs_stateid(&stream, &dss_info->stateid);
+ if (rc)
+ goto out_err_free;
- fls->mirror_array[i]->fh_versions =
- kcalloc(fh_count, sizeof(struct nfs_fh),
- gfp_flags);
- if (fls->mirror_array[i]->fh_versions == NULL) {
- rc = -ENOMEM;
- goto out_err_free;
- }
+ /* fh */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
- for (j = 0; j < fh_count; j++) {
- rc = decode_nfs_fh(&stream,
- &fls->mirror_array[i]->fh_versions[j]);
+ dss_info->fh_versions =
+ kcalloc(fh_count, sizeof(struct nfs_fh),
+ gfp_flags);
+ if (dss_info->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &dss_info->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ dss_info->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
- }
- fls->mirror_array[i]->fh_versions_cnt = fh_count;
+ uid = make_kuid(&init_user_ns, id);
- /* user */
- rc = decode_name(&stream, &id);
- if (rc)
- goto out_err_free;
+ /* group */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
- uid = make_kuid(&init_user_ns, id);
+ gid = make_kgid(&init_user_ns, id);
- /* group */
- rc = decode_name(&stream, &id);
- if (rc)
- goto out_err_free;
+ if (gfp_flags & __GFP_FS)
+ kcred = prepare_kernel_cred(&init_task);
+ else {
+ unsigned int nofs_flags = memalloc_nofs_save();
- gid = make_kgid(&init_user_ns, id);
+ kcred = prepare_kernel_cred(&init_task);
+ memalloc_nofs_restore(nofs_flags);
+ }
+ rc = -ENOMEM;
+ if (!kcred)
+ goto out_err_free;
+ kcred->fsuid = uid;
+ kcred->fsgid = gid;
+ cred = RCU_INITIALIZER(kcred);
- if (gfp_flags & __GFP_FS)
- kcred = prepare_kernel_cred(&init_task);
- else {
- unsigned int nofs_flags = memalloc_nofs_save();
- kcred = prepare_kernel_cred(&init_task);
- memalloc_nofs_restore(nofs_flags);
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(dss_info->ro_cred, cred);
+ else
+ rcu_assign_pointer(dss_info->rw_cred, cred);
}
- rc = -ENOMEM;
- if (!kcred)
- goto out_err_free;
- kcred->fsuid = uid;
- kcred->fsgid = gid;
- cred = RCU_INITIALIZER(kcred);
-
- if (lgr->range.iomode == IOMODE_READ)
- rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
- else
- rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
if (mirror != fls->mirror_array[i]) {
- /* swap cred ptrs so free_mirror will clean up old */
- if (lgr->range.iomode == IOMODE_READ) {
- cred = xchg(&mirror->ro_cred, cred);
- rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
- } else {
- cred = xchg(&mirror->rw_cred, cred);
- rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ for (dss_id = 0; dss_id < dss_count; dss_id++) {
+ dss_info = &fls->mirror_array[i]->dss[dss_id];
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->dss[dss_id].ro_cred,
+ dss_info->ro_cred);
+ rcu_assign_pointer(dss_info->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->dss[dss_id].rw_cred,
+ dss_info->rw_cred);
+ rcu_assign_pointer(dss_info->rw_cred, cred);
+ }
}
ff_layout_free_mirror(fls->mirror_array[i]);
fls->mirror_array[i] = mirror;
@@ -564,7 +639,7 @@ out_sort_mirrors:
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
- __free_page(scratch);
+ folio_put(scratch);
return ret;
out_err_free:
_ff_layout_free_lseg(fls);
@@ -593,6 +668,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
_ff_layout_free_lseg(fls);
}
+static u32 calc_commit_idx(struct pnfs_layout_segment *lseg,
+ u32 mirror_idx, u32 dss_id)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+ return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id;
+}
+
+static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg,
+ u32 commit_index)
+{
+ return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count;
+}
+
+static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg,
+ u32 commit_index)
+{
+ return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count;
+}
+
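
These helpers flatten the commit buckets to one per (mirror, stripe) pair, which is also why ff_layout_setup_ds_info() later in this patch sizes the commit array as mirror_array_cnt * dss_count. A round-trip check of the encoding in plain C, mirroring calc_commit_idx() and its two inverses above:

#include <stdio.h>
#include <stdint.h>

static uint32_t commit_idx(uint32_t mirror_idx, uint32_t dss_id,
			   uint32_t dss_count)
{
	return mirror_idx * dss_count + dss_id;
}

int main(void)
{
	const uint32_t dss_count = 3;

	for (uint32_t m = 0; m < 2; m++) {
		for (uint32_t s = 0; s < dss_count; s++) {
			uint32_t idx = commit_idx(m, s, dss_count);
			/* round trip: / recovers the mirror, % the stripe */
			printf("mirror %u stripe %u -> bucket %u -> (%u, %u)\n",
			       m, s, idx, idx / dss_count, idx % dss_count);
		}
	}
	return 0;
}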
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
@@ -617,6 +712,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
struct nfs4_ff_layoutstat *layoutstat,
ktime_t now)
{
@@ -624,8 +720,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
- if (!mirror->start_time)
- mirror->start_time = now;
+ if (!mirror->dss[dss_id].start_time)
+ mirror->dss[dss_id].start_time = now;
if (mirror->report_interval != 0)
report_interval = (s64)mirror->report_interval * 1000LL;
else if (layoutstats_timer != 0)
@@ -675,13 +771,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
static void
nfs4_ff_layout_stat_io_start_read(struct inode *inode,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
- nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ report = nfs4_ff_layoutstat_start_io(
+ mirror, dss_id, &mirror->dss[dss_id].read_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(
+ &mirror->dss[dss_id].read_stat, requested);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
@@ -692,11 +791,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested,
__u64 completed)
{
spin_lock(&mirror->lock);
- nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+ nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat,
requested, completed,
ktime_get(), task->tk_start);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
@@ -706,13 +806,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
static void
nfs4_ff_layout_stat_io_start_write(struct inode *inode,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
- nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ report = nfs4_ff_layoutstat_start_io(
+ mirror,
+ dss_id,
+ &mirror->dss[dss_id].write_stat,
+ now);
+ nfs4_ff_layout_stat_io_update_requested(
+ &mirror->dss[dss_id].write_stat,
+ requested);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
@@ -723,6 +830,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested,
__u64 completed,
enum nfs3_stable_how committed)
@@ -731,25 +839,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
requested = completed = 0;
spin_lock(&mirror->lock);
- nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+ nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat,
requested, completed, ktime_get(), task->tk_start);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
static void
-ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
if (devid)
nfs4_mark_deviceid_unavailable(devid);
}
static void
-ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
if (devid)
nfs4_mark_deviceid_available(devid);
@@ -758,6 +866,7 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
@@ -768,12 +877,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ *dss_id = nfs4_ff_layout_calc_dss_id(
+ fls->stripe_unit,
+ fls->mirror_array[idx]->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false);
if (IS_ERR(ds))
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) {
// reinitialize the error state in case this is the last iteration
ds = ERR_PTR(-EINVAL);
continue;
@@ -788,42 +901,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
- return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
- return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
struct nfs4_pnfs_ds *ds;
- ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id);
if (!IS_ERR(ds))
return ds;
- return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id);
}
static struct nfs4_pnfs_ds *
ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
- u32 *best_idx)
+ u32 *best_idx,
+ u32 offset,
+ u32 *dss_id)
{
struct pnfs_layout_segment *lseg = pgio->pg_lseg;
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
- best_idx);
+ best_idx, offset, dss_id);
if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
- return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+ return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx,
+ offset, dss_id);
}
static void
@@ -842,6 +965,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
}
+static bool
+ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls)
+{
+ return fls->mirror_array[0]->dss_count > 1;
+}
+
+/*
+ * ff_layout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ unsigned int size;
+ u64 p_stripe, r_stripe;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
+ else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
+
+ /* see if req and prev are in the same stripe */
+ if (prev) {
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ if (p_stripe != r_stripe)
+ return 0;
+ }
+
+ /* calculate remaining bytes in the current stripe */
+ div_u64_rem((u64)req_offset(req) - segment_offset,
+ stripe_unit,
+ &stripe_offset);
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
+
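
The same arithmetic in userspace (plain / and % instead of do_div()/div_u64_rem()), showing both of ff_layout_pg_test()'s checks: requests in different stripes never coalesce, and a coalesced request is capped at the bytes left in its stripe:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t stripe_unit = 1 << 20;		/* 1 MiB */
	const uint64_t segment_offset = 0;
	uint64_t prev_off = (1 << 20) - 4096;		/* last page of stripe 0 */
	uint64_t req_off  = 1 << 20;			/* first page of stripe 1 */

	if ((prev_off - segment_offset) / stripe_unit !=
	    (req_off - segment_offset) / stripe_unit)
		printf("different stripes: cannot coalesce\n");

	/* bytes remaining in the current stripe cap the request size */
	uint64_t stripe_offset = (req_off - segment_offset) % stripe_unit;
	printf("room left in stripe: %llu bytes\n",
	       (unsigned long long)(stripe_unit - stripe_offset));
	return 0;
}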
static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
@@ -849,7 +1022,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
- u32 ds_idx;
+ u32 ds_idx, dss_id;
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
@@ -870,7 +1043,8 @@ retry:
/* Reset wb_nio, since getting layout segment was successful */
req->wb_nio = 0;
- ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx,
+ req_offset(req), &dss_id);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -882,7 +1056,7 @@ retry:
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
pgm = &pgio->pg_mirrors[0];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+ pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize;
pgio->pg_mirror_idx = ds_idx;
return;
@@ -919,7 +1093,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
struct nfs4_pnfs_ds *ds;
- u32 i;
+ u32 i, dss_id;
retry:
pnfs_generic_pg_check_layout(pgio, req);
@@ -944,7 +1118,12 @@ retry:
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit,
+ mirror->dss_count,
+ req_offset(req));
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror,
+ dss_id, true);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -954,7 +1133,7 @@ retry:
goto retry;
}
pgm = &pgio->pg_mirrors[i];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize;
}
if (NFS_SERVER(pgio->pg_inode)->flags &
@@ -1020,14 +1199,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
.pg_init = ff_layout_pg_init_read,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
.pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
.pg_init = ff_layout_pg_init_write,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
.pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
.pg_cleanup = pnfs_generic_pg_cleanup,
@@ -1075,9 +1254,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ u32 dss_id = 0;
struct nfs4_pnfs_ds *ds;
- ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx,
+ hdr->args.offset, &dss_id);
if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
else
@@ -1114,11 +1295,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
struct pnfs_layout_hdr *lo = lseg->pls_layout;
struct inode *inode = lo->plh_inode;
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
switch (op_status) {
@@ -1215,9 +1396,9 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
u32 op_status,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
switch (op_status) {
case NFS_OK:
@@ -1281,12 +1462,12 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
if (task->tk_status >= 0) {
- ff_layout_mark_ds_reachable(lseg, idx);
+ ff_layout_mark_ds_reachable(lseg, idx, dss_id);
return 0;
}
@@ -1297,10 +1478,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
return ff_layout_async_handle_error_v3(task, op_status, clp,
- lseg, idx);
+ lseg, idx, dss_id);
case 4:
return ff_layout_async_handle_error_v4(task, op_status, state,
- clp, lseg, idx);
+ clp, lseg, idx, dss_id);
default:
/* should never happen */
WARN_ON_ONCE(1);
@@ -1309,7 +1490,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
}
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
- u32 idx, u64 offset, u64 length,
+ u32 idx, u32 dss_id, u64 offset, u64 length,
u32 *op_status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -1347,7 +1528,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, offset, length, status, opnum,
+ mirror, dss_id, offset, length, status, opnum,
nfs_io_gfp_mask());
switch (status) {
@@ -1356,7 +1537,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case NFS4ERR_PERM:
break;
case NFS4ERR_NXIO:
- ff_layout_mark_ds_unreachable(lseg, idx);
+ ff_layout_mark_ds_unreachable(lseg, idx, dss_id);
/*
* Don't return the layout if this is a read and we still
* have layouts to try
@@ -1376,10 +1557,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg);
+ u32 dss_id = nfs4_ff_layout_calc_dss_id(
+ flseg->stripe_unit,
+ flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count,
+ hdr->args.offset);
int err;
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ ff_layout_io_track_ds_error(hdr->lseg,
+ hdr->pgio_mirror_idx, dss_id,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_READ,
task->tk_status);
@@ -1389,7 +1576,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
err = ff_layout_async_handle_error(task, hdr->res.op_status,
hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
- hdr->pgio_mirror_idx);
+ hdr->pgio_mirror_idx,
+ dss_id);
trace_nfs4_pnfs_read(hdr, err);
clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
@@ -1445,23 +1633,47 @@ ff_layout_set_layoutcommit(struct inode *inode,
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_start_read(hdr->inode,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- task->tk_start);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_start_read(
+ hdr->inode,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ task->tk_start);
}
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- hdr->res.count);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_end_read(
+ task,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ hdr->res.count);
set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}
@@ -1549,11 +1761,17 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg);
+ u32 dss_id = nfs4_ff_layout_calc_dss_id(
+ flseg->stripe_unit,
+ flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count,
+ hdr->args.offset);
loff_t end_offs = 0;
int err;
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ ff_layout_io_track_ds_error(hdr->lseg,
+ hdr->pgio_mirror_idx, dss_id,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_WRITE,
task->tk_status);
@@ -1563,7 +1781,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
err = ff_layout_async_handle_error(task, hdr->res.op_status,
hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
- hdr->pgio_mirror_idx);
+ hdr->pgio_mirror_idx,
+ dss_id);
trace_nfs4_pnfs_write(hdr, err);
clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
@@ -1601,9 +1820,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
int err;
+ u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index);
+ u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index);
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+ ff_layout_io_track_ds_error(data->lseg, idx, dss_id,
data->args.offset, data->args.count,
&data->res.op_status, OP_COMMIT,
task->tk_status);
@@ -1611,8 +1832,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
}
err = ff_layout_async_handle_error(task, data->res.op_status,
- NULL, data->ds_clp, data->lseg,
- data->ds_commit_index);
+ NULL, data->ds_clp, data->lseg, idx,
+ dss_id);
trace_nfs4_pnfs_commit_ds(data, err);
switch (err) {
@@ -1631,30 +1852,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
}
ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
-
return 0;
}
static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_start_write(hdr->inode,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- task->tk_start);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_start_write(
+ hdr->inode,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ task->tk_start);
}
static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_end_write(
+ task,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ hdr->res.count,
+ hdr->res.verf->committed);
set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}
@@ -1737,10 +1982,16 @@ static void ff_layout_write_release(void *data)
static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ u32 idx, dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
return;
+
+ idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index);
+ dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index);
nfs4_ff_layout_stat_io_start_write(cdata->inode,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ FF_LAYOUT_COMP(cdata->lseg, idx),
+ dss_id,
0, task->tk_start);
}
@@ -1749,6 +2000,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
{
struct nfs_page *req;
__u64 count = 0;
+ u32 idx, dss_id;
if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
return;
@@ -1757,8 +2009,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
list_for_each_entry(req, &cdata->pages, wb_list)
count += req->wb_bytes;
}
+
+ idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index);
+ dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index);
nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ FF_LAYOUT_COMP(cdata->lseg, idx),
+ dss_id,
count, count, NFS_FILE_SYNC);
set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
}
@@ -1872,6 +2128,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
u32 idx = hdr->pgio_mirror_idx;
int vers;
struct nfs_fh *fh;
+ u32 dss_id;
bool ds_fatal_error = false;
dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
@@ -1879,22 +2136,26 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->args.pgbase, (size_t)hdr->args.count, offset);
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(lseg)->stripe_unit,
+ mirror->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- hdr->inode);
+ hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1902,11 +2163,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- fh = nfs4_ff_layout_select_ds_fh(mirror);
+ fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
- nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+ nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1916,7 +2177,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->mds_offset = offset;
/* Start IO accounting for local read */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ);
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
+ FMODE_READ);
if (localio) {
hdr->task.tk_start = ktime_get();
ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
@@ -1953,25 +2215,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
int vers;
struct nfs_fh *fh;
u32 idx = hdr->pgio_mirror_idx;
+ u32 dss_id;
bool ds_fatal_error = false;
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(lseg)->stripe_unit,
+ mirror->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- hdr->inode);
+ hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1981,12 +2248,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->pgio_done_cb = ff_layout_write_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_commit_idx = idx;
- fh = nfs4_ff_layout_select_ds_fh(mirror);
+ hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id);
+ fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
- nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+ nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1995,7 +2262,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->args.offset = offset;
/* Start IO accounting for local write */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
hdr->task.tk_start = ktime_get();
@@ -2019,20 +2286,15 @@ out_failed:
return PNFS_NOT_ATTEMPTED;
}
-static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
-{
- return i;
-}
-
static struct nfs_fh *
-select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id)
{
struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
/* FIXME: Assume that there is only one NFS version available
* for the DS.
*/
- return &flseg->mirror_array[i]->fh_versions[0];
+ return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0];
}
static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
@@ -2043,7 +2305,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct nfsd_file *localio;
struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
- u32 idx;
+ u32 idx, dss_id;
int vers, ret;
struct nfs_fh *fh;
@@ -2051,22 +2313,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
goto out_err;
- idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index);
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true);
if (IS_ERR(ds))
goto out_err;
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- data->inode);
+ data->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_err;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id);
if (!ds_cred)
goto out_err;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -2075,12 +2338,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
data->cred = ds_cred;
refcount_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
- fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ fh = select_ds_fh_from_commit(lseg, idx, dss_id);
if (fh)
data->args.fh = fh;
/* Start IO accounting for local commit */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
data->task.tk_start = ktime_get();
@@ -2144,25 +2407,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
struct nfs4_pnfs_ds *ds;
struct nfs_client *ds_clp;
struct rpc_clnt *clnt;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
mirror = flseg->mirror_array[idx];
- mirror_ds = mirror->mirror_ds;
- if (IS_ERR_OR_NULL(mirror_ds))
- continue;
- ds = mirror->mirror_ds->ds;
- if (!ds)
- continue;
- ds_clp = ds->ds_clp;
- if (!ds_clp)
- continue;
- clnt = ds_clp->cl_rpcclient;
- if (!clnt)
- continue;
- if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
- continue;
- rpc_clnt_disconnect(clnt);
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ mirror_ds = mirror->dss[dss_id].mirror_ds;
+ if (IS_ERR_OR_NULL(mirror_ds))
+ continue;
+ ds = mirror->dss[dss_id].mirror_ds->ds;
+ if (!ds)
+ continue;
+ ds_clp = ds->ds_clp;
+ if (!ds_clp)
+ continue;
+ clnt = ds_clp->cl_rpcclient;
+ if (!clnt)
+ continue;
+ if (!rpc_cancel_tasks(clnt, -EAGAIN,
+ ff_layout_match_io, lseg))
+ continue;
+ rpc_clnt_disconnect(clnt);
+ }
}
}
@@ -2184,8 +2450,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
struct inode *inode = lseg->pls_layout->plh_inode;
struct pnfs_commit_array *array, *new;
+ u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count;
- new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+ new = pnfs_alloc_commit_array(size,
nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
@@ -2549,11 +2816,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr,
static void
ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_ds_stripe *dss_info)
{
struct nfs4_pnfs_ds_addr *da;
- struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
- struct nfs_fh *fh = &mirror->fh_versions[0];
+ struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds;
+ struct nfs_fh *fh = &dss_info->fh_versions[0];
__be32 *p;
da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
@@ -2565,13 +2832,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
p = xdr_reserve_space(xdr, 4 + fh->size);
xdr_encode_opaque(p, fh->data, fh->size);
/* ff_io_latency4 read */
- spin_lock(&mirror->lock);
- ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+ spin_lock(&dss_info->mirror->lock);
+ ff_layout_encode_io_latency(xdr,
+ &dss_info->read_stat.io_stat);
/* ff_io_latency4 write */
- ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
- spin_unlock(&mirror->lock);
+ ff_layout_encode_io_latency(xdr,
+ &dss_info->write_stat.io_stat);
+ spin_unlock(&dss_info->mirror->lock);
/* nfstime4 */
- ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+ ff_layout_encode_nfstime(xdr,
+ ktime_sub(ktime_get(),
+ dss_info->start_time));
/* bool */
p = xdr_reserve_space(xdr, 4);
*p = cpu_to_be32(false);
@@ -2595,7 +2866,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
static void
ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
{
- struct nfs4_ff_layout_mirror *mirror = opaque->data;
+ struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data;
+ struct nfs4_ff_layout_mirror *mirror = dss_info->mirror;
ff_layout_put_mirror(mirror);
}
@@ -2612,37 +2884,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_ff_layout_ds_stripe *dss_info;
struct nfs4_deviceid_node *dev;
- int i = 0;
+ int i = 0, dss_id;
list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
- if (i >= dev_limit)
- break;
- if (IS_ERR_OR_NULL(mirror->mirror_ds))
- continue;
- if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
- &mirror->flags) &&
- type != NFS4_FF_OP_LAYOUTRETURN)
- continue;
- /* mirror refcount put in cleanup_layoutstats */
- if (!refcount_inc_not_zero(&mirror->ref))
- continue;
- dev = &mirror->mirror_ds->id_node;
- memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
- devinfo->offset = 0;
- devinfo->length = NFS4_MAX_UINT64;
- spin_lock(&mirror->lock);
- devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
- devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
- devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
- devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
- spin_unlock(&mirror->lock);
- devinfo->layout_type = LAYOUT_FLEX_FILES;
- devinfo->ld_private.ops = &layoutstat_ops;
- devinfo->ld_private.data = mirror;
-
- devinfo++;
- i++;
+ for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) {
+ dss_info = &mirror->dss[dss_id];
+ if (i >= dev_limit)
+ break;
+ if (IS_ERR_OR_NULL(dss_info->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
+ &mirror->flags) &&
+ type != NFS4_FF_OP_LAYOUTRETURN)
+ continue;
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!refcount_inc_not_zero(&mirror->ref))
+ continue;
+ dev = &dss_info->mirror_ds->id_node;
+ memcpy(&devinfo->dev_id,
+ &dev->deviceid,
+ NFS4_DEVICEID4_SIZE);
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ spin_lock(&mirror->lock);
+ devinfo->read_count =
+ dss_info->read_stat.io_stat.ops_completed;
+ devinfo->read_bytes =
+ dss_info->read_stat.io_stat.bytes_completed;
+ devinfo->write_count =
+ dss_info->write_stat.io_stat.ops_completed;
+ devinfo->write_bytes =
+ dss_info->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
+ devinfo->layout_type = LAYOUT_FLEX_FILES;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = &mirror->dss[dss_id];
+
+ devinfo++;
+ i++;
+ }
}
return i;
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 095df09017a5..17a008c8e97c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -21,6 +21,8 @@
* due to network error etc. */
#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096
+
/* LAYOUTSTATS report interval in ms */
#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
#define FF_LAYOUTSTATS_MAXDEV 4
@@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat {
struct nfs4_ff_busy_timer busy_timer;
};
-struct nfs4_ff_layout_mirror {
- struct pnfs_layout_hdr *layout;
- struct list_head mirrors;
- u32 ds_count;
- u32 efficiency;
+struct nfs4_ff_layout_mirror;
+
+struct nfs4_ff_layout_ds_stripe {
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
+ u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
@@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror {
const struct cred __rcu *ro_cred;
const struct cred __rcu *rw_cred;
struct nfs_file_localio nfl;
- refcount_t ref;
- spinlock_t lock;
- unsigned long flags;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
+};
+
+struct nfs4_ff_layout_mirror {
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
+ u32 dss_count;
+ struct nfs4_ff_layout_ds_stripe *dss;
+ refcount_t ref;
+ spinlock_t lock;
+ unsigned long flags;
u32 report_interval;
};
@@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
}
static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
if (mirror != NULL) {
- struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds;
if (!IS_ERR_OR_NULL(mirror_ds))
return &mirror_ds->id_node;
@@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
}
static inline int
-nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id)
+{
+ return mirror->dss[dss_id].mirror_ds->ds_versions[0].version;
+}
+
+static inline u32
+nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset)
{
- return mirror->mirror_ds->ds_versions[0].version;
+ u64 tmp = offset;
+
+ if (dss_count == 1 || stripe_unit == 0)
+ return 0;
+
+ do_div(tmp, stripe_unit);
+
+ return do_div(tmp, dss_count);
}
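
The inline helper above reduces to (offset / stripe_unit) % dss_count:
the first do_div() leaves the quotient in tmp, the second returns the
remainder, which is the stripe index. A rough self-contained userspace
sketch with hypothetical stripe parameters (64 KiB stripe unit, 4
stripes; not taken from this patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Same arithmetic as nfs4_ff_layout_calc_dss_id() */
	static uint32_t calc_dss_id(uint64_t stripe_unit, uint32_t dss_count,
				    uint64_t offset)
	{
		if (dss_count == 1 || stripe_unit == 0)
			return 0;
		return (uint32_t)((offset / stripe_unit) % dss_count);
	}

	int main(void)
	{
		/* offsets 0, 64K, 128K, 192K, 256K map to stripes 0, 1, 2, 3, 0 */
		for (uint64_t off = 0; off <= 256 * 1024; off += 64 * 1024)
			printf("offset=%llu dss_id=%u\n",
			       (unsigned long long)off,
			       calc_dss_id(64 * 1024, 4, off));
		return 0;
	}
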
struct nfs4_ff_layout_ds *
@@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
- struct nfs4_ff_layout_mirror *mirror, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- gfp_t gfp_flags);
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id, u64 offset, u64 length, int status,
+ enum nfs_opnum4 opnum, gfp_t gfp_flags);
void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
void ff_layout_free_ds_ioerr(struct list_head *head);
@@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
struct list_head *head,
unsigned int maxnum);
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id);
void
nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
- nfs4_stateid *stateid);
+ u32 dss_id,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
bool fail_return);
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp,
- struct inode *inode);
+ struct inode *inode,
+ u32 dss_id);
const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
const struct pnfs_layout_range *range,
- const struct cred *mdscred);
+ const struct cred *mdscred,
+ u32 dss_id);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 30365ec782bb..c55ea8fa3bfa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -44,7 +44,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
{
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
struct nfs4_ff_layout_ds *new_ds = NULL;
@@ -56,7 +56,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
int i, ret = -ENOMEM;
/* set up xdr stream */
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
goto out_err;
@@ -70,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
INIT_LIST_HEAD(&dsaddrs);
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* multipath count */
p = xdr_inline_decode(&stream, 4);
@@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
kfree(da);
}
- __free_page(scratch);
+ folio_put(scratch);
return new_ds;
out_err_drain_dsaddrs:
@@ -177,7 +177,7 @@ out_err_drain_dsaddrs:
kfree(ds_versions);
out_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out_err:
kfree(new_ds);
@@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
}
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
- struct nfs4_ff_layout_mirror *mirror, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- gfp_t gfp_flags)
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id, u64 offset, u64 length, int status,
+ enum nfs_opnum4 opnum, gfp_t gfp_flags)
{
struct nfs4_ff_layout_ds_err *dserr;
if (status == 0)
return 0;
- if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds))
return -EINVAL;
dserr = kmalloc(sizeof(*dserr), gfp_flags);
@@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
dserr->length = length;
dserr->status = status;
dserr->opnum = opnum;
- nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
- memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+ nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid);
+ memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid,
NFS4_DEVICEID4_SIZE);
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
@@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
}
static const struct cred *
-ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id)
{
const struct cred *cred, __rcu **pcred;
if (iomode == IOMODE_READ)
- pcred = &mirror->ro_cred;
+ pcred = &mirror->dss[dss_id].ro_cred;
else
- pcred = &mirror->rw_cred;
+ pcred = &mirror->dss[dss_id].rw_cred;
rcu_read_lock();
do {
@@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
}
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id)
{
/* FIXME: For now assume there is only 1 version available for the DS */
- return &mirror->fh_versions[0];
+ return &mirror->dss[dss_id].fh_versions[0];
}
void
nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
- nfs4_stateid *stateid)
+ u32 dss_id,
+ nfs4_stateid *stateid)
{
- if (nfs4_ff_layout_ds_version(mirror) == 4)
- nfs4_stateid_copy(stateid, &mirror->stateid);
+ if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4)
+ nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid);
}
static bool
ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id)
{
if (mirror == NULL)
goto outerr;
- if (mirror->mirror_ds == NULL) {
+ if (mirror->dss[dss_id].mirror_ds == NULL) {
struct nfs4_deviceid_node *node;
struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
- &mirror->devid, lo->plh_lc_cred,
+ &mirror->dss[dss_id].devid, lo->plh_lc_cred,
GFP_KERNEL);
if (node)
mirror_ds = FF_LAYOUT_MIRROR_DS(node);
/* check for race with another call to this function */
- if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) &&
mirror_ds != ERR_PTR(-ENODEV))
nfs4_put_deviceid_node(node);
}
- if (IS_ERR(mirror->mirror_ds))
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
goto outerr;
return true;
@@ -352,6 +354,7 @@ outerr:
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
* @mirror: layout mirror describing the DS to use
+ * @dss_id: DS stripe id selecting which stripe to use
* @fail_return: return layout on connect failure?
*
* Try to prepare a DS connection to accept an RPC call. This involves
@@ -368,6 +371,7 @@ outerr:
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
bool fail_return)
{
struct nfs4_pnfs_ds *ds;
@@ -376,10 +380,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
unsigned int max_payload;
int status = -EAGAIN;
- if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id))
goto noconnect;
- ds = mirror->mirror_ds->ds;
+ ds = mirror->dss[dss_id].mirror_ds->ds;
if (READ_ONCE(ds->ds_clp))
goto out;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
*/
- status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node,
dataserver_timeo, dataserver_retrans,
- mirror->mirror_ds->ds_versions[0].version,
- mirror->mirror_ds->ds_versions[0].minor_version);
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].version,
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
if (!status) {
@@ -404,15 +408,15 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
- if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
- mirror->mirror_ds->ds_versions[0].rsize = max_payload;
- if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
- mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+ if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload)
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload;
+ if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload)
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload;
goto out;
}
noconnect:
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, lseg->pls_range.offset,
+ mirror, dss_id, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
ff_layout_send_layouterror(lseg);
@@ -426,12 +430,13 @@ out:
const struct cred *
ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
const struct pnfs_layout_range *range,
- const struct cred *mdscred)
+ const struct cred *mdscred,
+ u32 dss_id)
{
const struct cred *cred;
- if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
- cred = ff_layout_get_mirror_cred(mirror, range->iomode);
+ if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) {
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id);
if (!cred)
cred = get_cred(mdscred);
} else {
@@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
* @mirror: pointer to the mirror
* @ds_clp: nfs_client for the DS
* @inode: pointer to inode
+ * @dss_id: DS stripe id
*
* Find or create a DS rpc client with the MDS server rpc client auth flavor
* in the nfs_client cl_ds_clients list.
*/
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
- struct nfs_client *ds_clp, struct inode *inode)
+ struct nfs_client *ds_clp, struct inode *inode,
+ u32 dss_id)
{
- switch (mirror->mirror_ds->ds_versions[0].version) {
+ switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) {
case 3:
/* For NFSv3 DS, flavor is set when creating DS connections */
return ds_clp->cl_rpcclient;
@@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (mirror) {
- if (!mirror->mirror_ds)
+ if (!mirror)
+ continue;
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ if (!mirror->dss[dss_id].mirror_ds)
return true;
- if (IS_ERR(mirror->mirror_ds))
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
continue;
- devid = &mirror->mirror_ds->id_node;
+ devid = &mirror->dss[dss_id].mirror_ds->id_node;
if (!nfs4_test_deviceid_unavailable(devid))
return true;
}
@@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (!mirror || IS_ERR(mirror->mirror_ds))
- return false;
- if (!mirror->mirror_ds)
- continue;
- devid = &mirror->mirror_ds->id_node;
- if (nfs4_test_deviceid_unavailable(devid))
+ if (!mirror)
return false;
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
+ return false;
+ if (!mirror->dss[dss_id].mirror_ds)
+ continue;
+ devid = &mirror->dss[dss_id].mirror_ds->id_node;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return false;
+ }
}
return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 9e94d18448ff..b4679b7161b0 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
int ret;
data->context[NFS_MAX_CONTEXT_LEN] = '\0';
- ret = vfs_parse_fs_string(fc, "context",
- data->context, strlen(data->context));
+ ret = vfs_parse_fs_string(fc, "context", data->context);
if (ret < 0)
return ret;
#else
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9bdaf7f38bed..18b57c7c2f97 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1073,6 +1073,21 @@ out_no_revalidate:
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
stat->btime = NFS_I(inode)->btime;
+
+ /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN:
+ * - NFS doesn't have DIO alignment constraints, so avoid fetching
+ * these DIO attrs from the remote server and just respond with the
+ * most accommodating limits (so the client will issue supported DIO).
+ * - this is unintuitive, but the most coarse-grained
+ * dio_offset_align is the most accommodating.
+ */
+ if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) &&
+ S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+ stat->dio_mem_align = 4; /* 4-byte alignment */
+ stat->dio_offset_align = PAGE_SIZE;
+ stat->dio_read_offset_align = stat->dio_offset_align;
+ }
out:
trace_nfs_getattr_exit(inode, err);
return err;
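
A minimal client-side sketch of how an application would observe the
alignments advertised above (assumes headers new enough to define
STATX_DIO_READ_ALIGN, i.e. Linux 6.14+ uapi and glibc 2.28+ for the
statx() wrapper; not part of this patch):

	#define _GNU_SOURCE
	#include <fcntl.h>     /* AT_FDCWD */
	#include <stdio.h>
	#include <sys/stat.h>  /* statx(), struct statx, STATX_* */

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2 || statx(AT_FDCWD, argv[1], 0,
				      STATX_DIOALIGN | STATX_DIO_READ_ALIGN,
				      &stx) != 0) {
			perror("statx");
			return 1;
		}
		/* On an NFS regular file, expect 4 and PAGE_SIZE here */
		if (stx.stx_mask & STATX_DIOALIGN)
			printf("dio_mem_align=%u dio_offset_align=%u\n",
			       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
		if (stx.stx_mask & STATX_DIO_READ_ALIGN)
			printf("dio_read_offset_align=%u\n",
			       stx.stx_dio_read_offset_align);
		return 0;
	}
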
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c0a44f389f8f..2ecd38e1d17a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
+struct nfs_local_dio {
+ u32 mem_align;
+ u32 offset_align;
+ loff_t middle_offset;
+ loff_t end_offset;
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
+
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 97abf62f109d..b575f0e6c7c8 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -30,6 +30,8 @@
#define NFSDBG_FACILITY NFSDBG_VFS
+#define NFSLOCAL_MAX_IOS 3
+
struct nfs_local_kiocb {
struct kiocb kiocb;
struct bio_vec *bvec;
@@ -37,6 +39,14 @@ struct nfs_local_kiocb {
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
struct nfsd_file *localio;
+ /* Begin mostly DIO-specific members */
+ size_t end_len;
+ short int end_iter_index;
+ short int n_iters;
+ bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
+ loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
+ struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ /* End mostly DIO-specific members */
};
struct nfs_local_fsync_ctx {
@@ -49,11 +59,6 @@ struct nfs_local_fsync_ctx {
static bool localio_enabled __read_mostly = true;
module_param(localio_enabled, bool, 0644);
-static bool localio_O_DIRECT_semantics __read_mostly = false;
-module_param(localio_O_DIRECT_semantics, bool, 0644);
-MODULE_PARM_DESC(localio_O_DIRECT_semantics,
- "LOCALIO will use O_DIRECT semantics to filesystem.");
-
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
return !!rcu_access_pointer(clp->cl_uuid.net);
@@ -231,13 +236,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfsd_file __rcu **pnf,
const fmode_t mode)
{
+ int status = 0;
struct nfsd_file *localio;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
cred, fh, nfl, pnf, mode);
if (IS_ERR(localio)) {
- int status = PTR_ERR(localio);
- trace_nfs_local_open_fh(fh, mode, status);
+ status = PTR_ERR(localio);
switch (status) {
case -ENOMEM:
case -ENXIO:
@@ -247,6 +252,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
nfs_local_probe(clp);
}
}
+ trace_nfs_local_open_fh(fh, mode, status);
return localio;
}
@@ -281,23 +287,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
-static struct bio_vec *
-nfs_bvec_alloc_and_import_pagevec(struct page **pagevec,
- unsigned int npages, gfp_t flags)
-{
- struct bio_vec *bvec, *p;
-
- bvec = kmalloc_array(npages, sizeof(*bvec), flags);
- if (bvec != NULL) {
- for (p = bvec; npages > 0; p++, pagevec++, npages--) {
- p->bv_page = *pagevec;
- p->bv_len = PAGE_SIZE;
- p->bv_offset = 0;
- }
- }
- return bvec;
-}
-
static void
nfs_local_iocb_free(struct nfs_local_kiocb *iocb)
{
@@ -311,40 +300,191 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
{
struct nfs_local_kiocb *iocb;
- iocb = kmalloc(sizeof(*iocb), flags);
+ iocb = kzalloc(sizeof(*iocb), flags);
if (iocb == NULL)
return NULL;
- iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec,
- hdr->page_array.npages, flags);
+
+ iocb->bvec = kmalloc_array(hdr->page_array.npages,
+ sizeof(struct bio_vec), flags);
if (iocb->bvec == NULL) {
kfree(iocb);
return NULL;
}
- if (localio_O_DIRECT_semantics &&
- test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
- iocb->kiocb.ki_filp = file;
- iocb->kiocb.ki_flags = IOCB_DIRECT;
- } else
- init_sync_kiocb(&iocb->kiocb, file);
+ init_sync_kiocb(&iocb->kiocb, file);
- iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
iocb->aio_complete_work = NULL;
+ iocb->end_iter_index = -1;
+
return iocb;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+static bool
+nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
+ size_t len, struct nfs_local_dio *local_dio)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ loff_t offset = hdr->args.offset;
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align, &nf_dio_read_offset_align);
+ if (rw == ITER_DEST)
+ nf_dio_offset_align = nf_dio_read_offset_align;
+
+ if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
+ return false;
+ if (unlikely(nf_dio_offset_align > PAGE_SIZE))
+ return false;
+ if (unlikely(len < nf_dio_offset_align))
+ return false;
+
+ local_dio->mem_align = nf_dio_mem_align;
+ local_dio->offset_align = nf_dio_offset_align;
+
+ start_end = round_up(offset, nf_dio_offset_align);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, nf_dio_offset_align);
+
+ local_dio->middle_offset = start_end;
+ local_dio->end_offset = middle_end;
+
+ local_dio->start_len = start_end - offset;
+ local_dio->middle_len = middle_end - start_end;
+ local_dio->end_len = orig_end - middle_end;
+
+ if (rw == ITER_DEST)
+ trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio);
+ else
+ trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio);
+ return true;
+}
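
A worked example of the split computed above, with hypothetical values
offset = 1000, len = 10000, nf_dio_offset_align = 512:

	start_end  = round_up(1000, 512)    = 1024   -> start_len  =   24
	orig_end   = 1000 + 10000           = 11000
	middle_end = round_down(11000, 512) = 10752  -> middle_len = 9728
	                                              -> end_len   =  248

Only the [1024, 10752) middle extent can be issued as DIO; the 24-byte
head and 248-byte tail fall back to buffered IO.
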
+
+static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
+ unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if (size & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
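
Note the masks here are alignments minus one: the caller below passes
mem_align-1 and offset_align-1, the usual power-of-two trick where
x & (align - 1) == 0 iff x is align-aligned. A tiny standalone sketch:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		const uint32_t align = 512, mask = align - 1; /* 0x1ff */

		assert((1024u & mask) == 0);   /* 512-byte aligned */
		assert((1000u & mask) == 232); /* misaligned by 232 bytes */
		return 0;
	}
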
+
+/*
+ * Set up as many as 3 iov_iters based on the extents described by @local_dio.
+ * Returns the number of iov_iters that were set up.
+ */
+static int
+nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
+ unsigned int nvecs, size_t len,
+ struct nfs_local_dio *local_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = iocb->iters;
+
+ /* Set up misaligned start? */
+ if (local_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iters[n_iters].count = local_dio->start_len;
+ iocb->offset[n_iters] = iocb->hdr->args.offset;
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Set up misaligned end?
+ * If so, the end is deliberately set up to be issued using buffered IO
+ * before the middle (which will use DIO, if DIO-aligned, with AIO).
+ * This creates problems if/when the end results in a partial write,
+ * so the index and length of the end must be saved to handle that
+ * corner case.
+ */
+ if (local_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iocb->offset[n_iters] = local_dio->end_offset;
+ iov_iter_advance(&iters[n_iters],
+ local_dio->start_len + local_dio->middle_len);
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ /* Save index and length of end */
+ iocb->end_iter_index = n_iters;
+ iocb->end_len = local_dio->end_len;
+ ++n_iters;
+ }
+
+ /* Set up the DIO-aligned middle to be issued last, to allow for
+ * DIO with AIO completion (see nfs_local_call_{read,write}).
+ */
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ if (local_dio->start_len)
+ iov_iter_advance(&iters[n_iters], local_dio->start_len);
+ iters[n_iters].count -= local_dio->end_len;
+ iocb->offset[n_iters] = local_dio->middle_offset;
+
+ iocb->iter_is_dio_aligned[n_iters] =
+ nfs_iov_iter_aligned_bvec(&iters[n_iters],
+ local_dio->mem_align-1, local_dio->offset_align-1);
+
+ if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
+ trace_nfs_local_dio_misaligned(iocb->hdr->inode,
+ iocb->hdr->args.offset, len, local_dio);
+ return 0; /* no DIO-aligned IO possible */
+ }
+ ++n_iters;
+
+ iocb->n_iters = n_iters;
+ return n_iters;
+}
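
Continuing the hypothetical offset = 1000 / len = 10000 / align = 512
example, the iterators produced above and their issue order would be:

	iters[0]: offset  1000, len   24, buffered (misaligned start)
	iters[1]: offset 10752, len  248, buffered (misaligned end;
	          end_iter_index = 1, end_len = 248)
	iters[2]: offset  1024, len 9728, O_DIRECT (aligned middle, issued
	          last so AIO completion can complete the whole request)
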
+
+static noinline_for_stack void
+nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
+ struct page **pagevec = hdr->page_array.pagevec;
+ unsigned long v, total;
+ unsigned int base;
+ size_t len;
+
+ v = 0;
+ total = hdr->args.count;
+ base = hdr->args.pgbase;
+ while (total && v < hdr->page_array.npages) {
+ len = min_t(size_t, total, PAGE_SIZE - base);
+ bvec_set_page(&iocb->bvec[v], *pagevec, len, base);
+ total -= len;
+ ++pagevec;
+ ++v;
+ base = 0;
+ }
+ len = hdr->args.count - total;
+
+ if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+ struct nfs_local_dio local_dio;
+
+ if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
+ return; /* is DIO-aligned */
+ }
- iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages,
- hdr->args.count + hdr->args.pgbase);
- if (hdr->args.pgbase != 0)
- iov_iter_advance(i, hdr->args.pgbase);
+ /* Use buffered IO */
+ iocb->offset[0] = hdr->args.offset;
+ iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
+ iocb->n_iters = 1;
}
static void
@@ -367,10 +507,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
static void
nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
{
+ /* Must handle partial completions */
if (status >= 0) {
- hdr->res.count = status;
- hdr->res.op_status = NFS4_OK;
- hdr->task.tk_status = 0;
+ hdr->res.count += status;
+ /* @hdr was initialized to 0 (zeroed during allocation) */
+ if (hdr->task.tk_status == 0)
+ hdr->res.op_status = NFS4_OK;
} else {
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
@@ -378,12 +520,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
}
static void
+nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_file_put(iocb->localio);
+ nfs_local_iocb_free(iocb);
+}
+
+static void
nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
- nfs_local_file_put(iocb->localio);
- nfs_local_iocb_free(iocb);
+ nfs_local_iocb_release(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
@@ -405,7 +553,10 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
struct nfs_pgio_header *hdr = iocb->hdr;
struct file *filp = iocb->kiocb.ki_filp;
- nfs_local_pgio_done(hdr, status);
+ if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
+ }
/*
* Must clear replen otherwise NFSv3 data corruption will occur
@@ -434,6 +585,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
+ nfs_local_pgio_done(iocb->hdr, ret);
nfs_local_read_done(iocb, ret);
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
}
@@ -444,14 +596,25 @@ static void nfs_local_call_read(struct work_struct *work)
container_of(work, struct nfs_local_kiocb, work);
struct file *filp = iocb->kiocb.ki_filp;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, READ);
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
revert_creds(save_cred);
@@ -462,33 +625,17 @@ static void nfs_local_call_read(struct work_struct *work)
}
static int
-nfs_do_local_read(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_read(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without read_iter */
- if (!file->f_op->read_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_read);
queue_work(nfslocaliod_workqueue, &iocb->work);
@@ -597,7 +744,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
+ if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
+ }
+
/* Handle short writes as if they are ENOSPC */
+ status = hdr->res.count;
if (status > 0 && status < hdr->args.count) {
hdr->mds_offset += status;
hdr->args.offset += status;
@@ -605,11 +758,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
hdr->args.count -= status;
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
+ /* record -ENOSPC via nfs_local_pgio_done */
+ nfs_local_pgio_done(hdr, status);
}
- if (status < 0)
+ if (hdr->task.tk_status < 0)
nfs_reset_boot_verifier(inode);
-
- nfs_local_pgio_done(hdr, status);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -626,6 +779,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
+ nfs_local_pgio_done(iocb->hdr, ret);
nfs_local_write_done(iocb, ret);
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
}
@@ -637,16 +791,53 @@ static void nfs_local_call_write(struct work_struct *work)
struct file *filp = iocb->kiocb.ki_filp;
unsigned long old_flags = current->flags;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, WRITE);
-
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ for (int i = 0; i < iocb->n_iters ; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+retry:
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
+ /* partial write */
+ if (i == iocb->end_iter_index) {
+ /* Must not account the partial end: because the
+ * end is issued before the middle, the partial
+ * write accounting in nfs_local_write_done()
+ * would otherwise incorrectly advance
+ * hdr->args.offset
+ */
+ status = 0;
+ } else {
+ /* Partial write at start or buffered middle,
+ * exit early.
+ */
+ nfs_local_pgio_done(iocb->hdr, status);
+ break;
+ }
+ } else if (unlikely(status == -ENOTBLK &&
+ (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
+ /* VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+ iocb->kiocb.ki_complete = NULL;
+ iocb->aio_complete_work = NULL;
+ goto retry;
+ }
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
file_end_write(filp);
revert_creds(save_cred);
@@ -660,26 +851,15 @@ static void nfs_local_call_write(struct work_struct *work)
}
static int
-nfs_do_local_write(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_write(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without write_iter */
- if (!file->f_op->write_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
switch (hdr->args.stable) {
default:
break;
@@ -694,43 +874,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_write);
queue_work(nfslocaliod_workqueue, &iocb->work);
return 0;
}
+static struct nfs_local_kiocb *
+nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
+{
+ struct file *file = nfs_to->nfsd_file_file(localio);
+ struct nfs_local_kiocb *iocb;
+ gfp_t gfp_mask;
+ int rw;
+
+ if (hdr->rw_mode & FMODE_READ) {
+ if (!file->f_op->read_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_KERNEL;
+ rw = ITER_DEST;
+ } else {
+ if (!file->f_op->write_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_NOIO;
+ rw = ITER_SOURCE;
+ }
+
+ iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
+ if (iocb == NULL)
+ return ERR_PTR(-ENOMEM);
+ iocb->hdr = hdr;
+ iocb->localio = localio;
+
+ nfs_local_iters_init(iocb, rw);
+
+ return iocb;
+}
+
int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
+ struct nfs_local_kiocb *iocb;
int status = 0;
if (!hdr->args.count)
return 0;
+ iocb = nfs_local_iocb_init(hdr, localio);
+ if (IS_ERR(iocb))
+ return PTR_ERR(iocb);
+
switch (hdr->rw_mode) {
case FMODE_READ:
- status = nfs_do_local_read(hdr, localio, call_ops);
+ status = nfs_local_do_read(iocb, call_ops);
break;
case FMODE_WRITE:
- status = nfs_do_local_write(hdr, localio, call_ops);
+ status = nfs_local_do_write(iocb, call_ops);
break;
default:
dprintk("%s: invalid mode: %d\n", __func__,
hdr->rw_mode);
- status = -EINVAL;
+ status = -EOPNOTSUPP;
}
if (status != 0) {
if (status == -EAGAIN)
nfs_localio_disable_client(clp);
- nfs_local_file_put(localio);
+ nfs_local_iocb_release(iocb);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f9a3a1fbf44c..5a4d193da1a9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc)
nfs_errorf(fc, "NFS: Couldn't determine submount pathname");
ret = PTR_ERR(p);
} else {
- ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p);
+ ret = vfs_parse_fs_qstr(fc, "source",
+ &QSTR_LEN(p, buffer + 4096 - p));
if (!ret)
ret = vfs_get_tree(fc);
}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 6e75c6c2d234..9eff09158518 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 4ae01c10b7e2..e17d72908412 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfsacl.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6a0b5871ba3b..d537fb0c230e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
ret = -ENOMEM;
- res.scratch = alloc_page(GFP_KERNEL);
+ res.scratch = folio_alloc(GFP_KERNEL, 0);
if (!res.scratch)
goto out;
@@ -1552,7 +1552,7 @@ out_free_pages:
}
kfree(pages);
out_free_scratch:
- __free_page(res.scratch);
+ folio_put(res.scratch);
out:
return ret;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 4cc915d5741d..e10d83ba835e 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
struct compound_hdr hdr;
int status;
- xdr_set_scratch_page(xdr, res->scratch);
+ xdr_set_scratch_folio(xdr, res->scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c9a0d1e420c6..7f43e890d356 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -456,4 +456,5 @@ const struct file_operations nfs4_file_operations = {
#else
.llseek = nfs_file_llseek,
#endif
+ .fop_flags = FOP_DONTCACHE,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ce61253efd45..f58098417142 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -391,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
*p++ = htonl(attrs); /* bitmap */
*p++ = htonl(12); /* attribute buffer length */
*p++ = htonl(NF4DIR);
+ spin_lock(&dentry->d_lock);
p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
+ spin_unlock(&dentry->d_lock);
readdir->pgbase = (char *)p - (char *)start;
readdir->count -= readdir->pgbase;
@@ -6160,7 +6162,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
}
/* for decoding across pages */
- res.acl_scratch = alloc_page(GFP_KERNEL);
+ res.acl_scratch = folio_alloc(GFP_KERNEL, 0);
if (!res.acl_scratch)
goto out_free;
@@ -6196,7 +6198,7 @@ out_free:
while (--i >= 0)
__free_page(pages[i]);
if (res.acl_scratch)
- __free_page(res.acl_scratch);
+ folio_put(res.acl_scratch);
kfree(pages);
return ret;
}
@@ -7872,10 +7874,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
return err;
do {
err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
- if (err != -NFS4ERR_DELAY)
+ if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE)
break;
ssleep(1);
- } while (err == -NFS4ERR_DELAY);
+ } while (err == -NFS4ERR_DELAY || err == -NFS4ERR_GRACE);
return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
}
@@ -9442,7 +9444,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
goto out;
if (rcvd->max_rqst_sz > sent->max_rqst_sz)
return -EINVAL;
- if (rcvd->max_resp_sz < sent->max_resp_sz)
+ if (rcvd->max_resp_sz > sent->max_resp_sz)
return -EINVAL;
if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
return -EINVAL;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7612e977e80b..01179f7de322 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2744,6 +2744,9 @@ out_error:
case -ENETUNREACH:
nfs_mark_client_ready(clp, -EIO);
break;
+ case -EINVAL:
+ nfs_mark_client_ready(clp, status);
+ break;
default:
ssleep(1);
break;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index b29a26923ce0..5ec9c83f1ef0 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -149,21 +149,9 @@ static int do_nfs4_mount(struct nfs_server *server,
struct fs_context *root_fc;
struct vfsmount *root_mnt;
struct dentry *dentry;
- size_t len;
+ char *source;
int ret;
- struct fs_parameter param = {
- .key = "source",
- .type = fs_value_is_string,
- .dirfd = -1,
- };
-
- struct fs_parameter param_fsc = {
- .key = "fsc",
- .type = fs_value_is_string,
- .dirfd = -1,
- };
-
if (IS_ERR(server))
return PTR_ERR(server);
@@ -181,15 +169,7 @@ static int do_nfs4_mount(struct nfs_server *server,
root_ctx->server = server;
if (ctx->fscache_uniq) {
- len = strlen(ctx->fscache_uniq);
- param_fsc.size = len;
- param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL);
- if (param_fsc.string == NULL) {
- put_fs_context(root_fc);
- return -ENOMEM;
- }
- ret = vfs_parse_fs_param(root_fc, &param_fsc);
- kfree(param_fsc.string);
+ ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq);
if (ret < 0) {
put_fs_context(root_fc);
return ret;
@@ -197,20 +177,18 @@ static int do_nfs4_mount(struct nfs_server *server,
}
/* We leave export_path unset as it's not used to find the root. */
- len = strlen(hostname) + 5;
- param.string = kmalloc(len, GFP_KERNEL);
- if (param.string == NULL) {
- put_fs_context(root_fc);
- return -ENOMEM;
- }
-
/* Does hostname need to be enclosed in brackets? */
if (strchr(hostname, ':'))
- param.size = snprintf(param.string, len, "[%s]:/", hostname);
+ source = kasprintf(GFP_KERNEL, "[%s]:/", hostname);
else
- param.size = snprintf(param.string, len, "%s:/", hostname);
- ret = vfs_parse_fs_param(root_fc, &param);
- kfree(param.string);
+ source = kasprintf(GFP_KERNEL, "%s:/", hostname);
+
+ if (!source) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+ ret = vfs_parse_fs_string(root_fc, "source", source);
+ kfree(source);
if (ret < 0) {
put_fs_context(root_fc);
return ret;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 49ff98571fa5..1d0e6c10f921 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4930,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
}
/*
- * The prefered block size for layout directed io
+ * The preferred block size for layout directed io
*/
static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
uint32_t *res)
@@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
int status;
if (res->acl_scratch != NULL)
- xdr_set_scratch_page(xdr, res->acl_scratch);
+ xdr_set_scratch_folio(xdr, res->acl_scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 627115179795..6ce55e8e6b67 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -45,6 +45,23 @@
{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
{ BIT(NFS_INO_ODIRECT), "ODIRECT" })
+#define nfs_show_wb_flags(v) \
+ __print_flags(v, "|", \
+ { BIT(PG_BUSY), "BUSY" }, \
+ { BIT(PG_MAPPED), "MAPPED" }, \
+ { BIT(PG_FOLIO), "FOLIO" }, \
+ { BIT(PG_CLEAN), "CLEAN" }, \
+ { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \
+ { BIT(PG_INODE_REF), "INODE_REF" }, \
+ { BIT(PG_HEADLOCK), "HEADLOCK" }, \
+ { BIT(PG_TEARDOWN), "TEARDOWN" }, \
+ { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \
+ { BIT(PG_UPTODATE), "UPTODATE" }, \
+ { BIT(PG_WB_END), "WB_END" }, \
+ { BIT(PG_REMOVE), "REMOVE" }, \
+ { BIT(PG_CONTENDED1), "CONTENDED1" }, \
+ { BIT(PG_CONTENDED2), "CONTENDED2" })
+
DECLARE_EVENT_CLASS(nfs_inode_event,
TP_PROTO(
const struct inode *inode
@@ -967,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = offset,
+ __entry->offset = offset;
__entry->count = count;
),
@@ -1017,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = offset,
- __entry->count = count,
+ __entry->offset = offset;
+ __entry->count = count;
__entry->ret = ret;
),
@@ -1051,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done);
DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio);
DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done);
+DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_update_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_write_begin);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_write_end);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_writepages);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done);
+
+DECLARE_EVENT_CLASS(nfs_kiocb_event,
+ TP_PROTO(
+ const struct kiocb *iocb,
+ const struct iov_iter *iter
+ ),
+
+ TP_ARGS(iocb, iter),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
+ __entry->flags = iocb->ki_flags;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->count,
+ __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS)
+ )
+);
+
+#define DEFINE_NFS_KIOCB_EVENT(name) \
+ DEFINE_EVENT(nfs_kiocb_event, name, \
+ TP_PROTO( \
+ const struct kiocb *iocb, \
+ const struct iov_iter *iter \
+ ), \
+ TP_ARGS(iocb, iter))
+
+DEFINE_NFS_KIOCB_EVENT(nfs_file_read);
+DEFINE_NFS_KIOCB_EVENT(nfs_file_write);
+
TRACE_EVENT(nfs_aop_readahead,
TP_PROTO(
const struct inode *inode,
@@ -1398,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done,
)
);
+DECLARE_EVENT_CLASS(nfs_page_class,
+ TP_PROTO(
+ const struct nfs_page *req
+ ),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(const struct nfs_page *__private, req)
+ __field(loff_t, offset)
+ __field(unsigned int, count)
+ __field(unsigned long, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = folio_inode(req->wb_folio);
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->req = req;
+ __entry->offset = req_offset(req);
+ __entry->count = req->wb_bytes;
+ __entry->flags = req->wb_flags;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->req, __entry->offset, __entry->count,
+ nfs_show_wb_flags(__entry->flags)
+ )
+);
+
+#define DEFINE_NFS_PAGE_EVENT(name) \
+ DEFINE_EVENT(nfs_page_class, name, \
+ TP_PROTO( \
+ const struct nfs_page *req \
+ ), \
+ TP_ARGS(req))
+
+DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup);
+DEFINE_NFS_PAGE_EVENT(nfs_do_writepage);
+
DECLARE_EVENT_CLASS(nfs_page_error_class,
TP_PROTO(
const struct inode *inode,
@@ -1599,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+DECLARE_EVENT_CLASS(nfs_local_dio_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t offset,
+ ssize_t count,
+ const struct nfs_local_dio *local_dio
+ ),
+ TP_ARGS(inode, offset, count, local_dio),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(loff_t, start)
+ __field(ssize_t, start_len)
+ __field(loff_t, middle)
+ __field(ssize_t, middle_len)
+ __field(loff_t, end)
+ __field(ssize_t, end_len)
+ ),
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->mem_align = local_dio->mem_align;
+ __entry->offset_align = local_dio->offset_align;
+ __entry->start = offset;
+ __entry->start_len = local_dio->start_len;
+ __entry->middle = local_dio->middle_offset;
+ __entry->middle_len = local_dio->middle_len;
+ __entry->end = local_dio->end_offset;
+ __entry->end_len = local_dio->end_len;
+ ),
+ TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd "
+ "mem_align=%u offset_align=%u "
+ "start=%llu+%zd middle=%llu+%zd end=%llu+%zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset, __entry->count,
+ __entry->mem_align, __entry->offset_align,
+ __entry->start, __entry->start_len,
+ __entry->middle, __entry->middle_len,
+ __entry->end, __entry->end_len)
+);
+
+#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \
+DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \
+ TP_PROTO(const struct inode *inode, \
+ loff_t offset, \
+ ssize_t count, \
+ const struct nfs_local_dio *local_dio),\
+ TP_ARGS(inode, offset, count, local_dio))
+
+DEFINE_NFS_LOCAL_DIO_EVENT(read);
+DEFINE_NFS_LOCAL_DIO_EVENT(write);
+DEFINE_NFS_LOCAL_DIO_EVENT(misaligned);
+
+#endif /* CONFIG_NFS_LOCALIO */
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
@@ -1713,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh,
),
TP_printk(
- "error=%d fhandle=0x%08x mode=%s",
- __entry->error,
+ "fhandle=0x%08x mode=%s result=%d",
__entry->fhandle,
- show_fs_fmode_flags(__entry->fmode)
+ show_fs_fmode_flags(__entry->fmode),
+ __entry->error
)
);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 647c53d1418a..0fb6905736d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -296,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio)
{
struct nfs_server *nfss = NFS_SERVER(folio->mapping->host);
- folio_end_writeback(folio);
+ folio_end_writeback_no_dropbehind(folio);
if (atomic_long_dec_return(&nfss->writeback) <
NFS_CONGESTION_OFF_THRESH) {
nfss->write_congested = 0;
@@ -593,6 +593,7 @@ static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
if (IS_ERR(req))
return PTR_ERR(req);
+ trace_nfs_do_writepage(req);
nfs_folio_set_writeback(folio);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
@@ -656,12 +657,14 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;
+ trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start);
+
/* Wait with writeback until write congestion eases */
if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) {
err = wait_event_killable(nfss->write_congestion_wait,
nfss->write_congested == 0);
if (err)
- return err;
+ goto out_err;
}
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
@@ -692,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
} while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
- if (err < 0)
- goto out_err;
- return 0;
+ if (err > 0)
+ err = 0;
out_err:
+ trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err);
return err;
}
@@ -745,6 +748,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
spin_unlock(&mapping->i_private_lock);
+
+ folio_end_dropbehind(folio);
}
nfs_page_group_unlock(req);
@@ -926,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
req->wb_nio = 0;
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo,
- hdr->pgio_mirror_idx);
+ hdr->ds_commit_idx);
goto next;
}
remove_req:
@@ -1017,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
unsigned int end;
int error;
+ trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes);
end = offset + bytes;
req = nfs_lock_and_join_requests(folio);
if (IS_ERR_OR_NULL(req))
- return req;
+ goto out;
rqend = req->wb_offset + req->wb_bytes;
/*
@@ -1043,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
else
req->wb_bytes = rqend - req->wb_offset;
req->wb_nio = 0;
+out:
+ trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes,
+ PTR_ERR_OR_ZERO(req));
return req;
out_flushme:
/*
@@ -1053,6 +1062,7 @@ out_flushme:
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
error = nfs_wb_folio(folio->mapping->host, folio);
+ trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error);
return (error < 0) ? ERR_PTR(error) : NULL;
}
@@ -1090,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx,
req = nfs_setup_write_request(ctx, folio, offset, count);
if (IS_ERR(req))
return PTR_ERR(req);
+ trace_nfs_writepage_setup(req);
/* Update file length */
nfs_grow_file(folio, offset, count);
nfs_mark_uptodate(req);
@@ -1290,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
+ trace_nfs_update_folio(inode, offset, count);
+
dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count,
(long long)(folio_pos(folio) + offset));
@@ -1309,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio,
if (status < 0)
nfs_set_pageerror(mapping);
out:
+ trace_nfs_update_folio_done(inode, offset, count, status);
dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
@@ -1806,7 +1820,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
nfs_mapping_set_error(folio, status);
nfs_inode_remove_request(req);
}
- dprintk_cont(", error = %d\n", status);
+ dprintk(", error = %d\n", status);
goto next;
}
@@ -1816,11 +1830,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a match */
if (folio)
nfs_inode_remove_request(req);
- dprintk_cont(" OK\n");
+ dprintk(" OK\n");
goto next;
}
/* We have a mismatch. Write the page again */
- dprintk_cont(" mismatch\n");
+ dprintk(" mismatch\n");
nfs_mark_request_dirty(req);
atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next:
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 85ca663c052c..e010d90aeb27 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
refcount_set(&nf->nf_ref, 1);
nf->nf_may = need;
nf->nf_mark = NULL;
+ nf->nf_dio_mem_align = 0;
+ nf->nf_dio_offset_align = 0;
+ nf->nf_dio_read_offset_align = 0;
return nf;
}
@@ -1070,6 +1073,35 @@ nfsd_file_is_cached(struct inode *inode)
}
static __be32
+nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf)
+{
+ struct inode *inode = file_inode(nf->nf_file);
+ struct kstat stat;
+ __be32 status;
+
+ /* Currently only need to get DIO alignment info for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return nfs_ok;
+
+ status = fh_getattr(fhp, &stat);
+ if (status != nfs_ok)
+ return status;
+
+ trace_nfsd_file_get_dio_attrs(inode, &stat);
+
+ if (stat.result_mask & STATX_DIOALIGN) {
+ nf->nf_dio_mem_align = stat.dio_mem_align;
+ nf->nf_dio_offset_align = stat.dio_offset_align;
+ }
+ if (stat.result_mask & STATX_DIO_READ_ALIGN)
+ nf->nf_dio_read_offset_align = stat.dio_read_offset_align;
+ else
+ nf->nf_dio_read_offset_align = nf->nf_dio_offset_align;
+
+ return nfs_ok;
+}
+
+static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
struct svc_cred *cred,
struct auth_domain *client,
@@ -1187,6 +1219,8 @@ open_file:
}
status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
+ if (status == nfs_ok)
+ status = nfsd_file_get_dio_attrs(fhp, nf);
}
} else
status = nfserr_jukebox;
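
The alignment values cached here per nfsd_file are the same attributes userspace can query through statx(2). A minimal userspace probe, as a hedged sketch (it assumes uapi headers new enough to define STATX_DIO_READ_ALIGN and the stx_dio_read_offset_align field):

    #define _GNU_SOURCE
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        struct statx stx;

        if (argc < 2 || statx(AT_FDCWD, argv[1], 0,
                              STATX_DIOALIGN | STATX_DIO_READ_ALIGN, &stx))
            return 1;
        if (stx.stx_mask & STATX_DIOALIGN)
            printf("mem_align=%u offset_align=%u\n",
                   stx.stx_dio_mem_align, stx.stx_dio_offset_align);
        if (stx.stx_mask & STATX_DIO_READ_ALIGN)
            printf("read_offset_align=%u\n", stx.stx_dio_read_offset_align);
        return 0;
    }
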
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 722b26c71e45..237a05c74211 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -54,6 +54,10 @@ struct nfsd_file {
struct list_head nf_gc;
struct rcu_head nf_rcu;
ktime_t nf_birthtime;
+
+ u32 nf_dio_mem_align;
+ u32 nf_dio_offset_align;
+ u32 nf_dio_read_offset_align;
};
int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index cb237f1b902a..9e0a37cd29d8 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
return localio;
}
+static void nfsd_file_dio_alignment(struct nfsd_file *nf,
+ u32 *nf_dio_mem_align,
+ u32 *nf_dio_offset_align,
+ u32 *nf_dio_read_offset_align)
+{
+ *nf_dio_mem_align = nf->nf_dio_mem_align;
+ *nf_dio_offset_align = nf->nf_dio_offset_align;
+ *nf_dio_read_offset_align = nf->nf_dio_read_offset_align;
+}
+
static const struct nfsd_localio_operations nfsd_localio_ops = {
.nfsd_net_try_get = nfsd_net_try_get,
.nfsd_net_put = nfsd_net_put,
@@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = {
.nfsd_file_put_local = nfsd_file_put_local,
.nfsd_file_get_local = nfsd_file_get_local,
.nfsd_file_file = nfsd_file_file,
+ .nfsd_file_dio_alignment = nfsd_file_dio_alignment,
};
void nfsd_localio_ops_init(void)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc6b776fc657..2b79129703d5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1103,89 +1103,48 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
* populating the filesystem.
*/
-/* Basically copying rpc_get_inode. */
static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode = new_inode(sb);
- if (!inode)
- return NULL;
- /* Following advice from simple_fill_super documentation: */
- inode->i_ino = iunique(sb, NFSD_MaxReserved);
- inode->i_mode = mode;
- simple_inode_init_ts(inode);
- switch (mode & S_IFMT) {
- case S_IFDIR:
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &simple_dir_inode_operations;
- inc_nlink(inode);
- break;
- case S_IFLNK:
- inode->i_op = &simple_symlink_inode_operations;
- break;
- default:
- break;
+ if (inode) {
+ /* Following advice from simple_fill_super documentation: */
+ inode->i_ino = iunique(sb, NFSD_MaxReserved);
+ inode->i_mode = mode;
+ simple_inode_init_ts(inode);
}
return inode;
}
-static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl)
+static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name)
{
+ struct inode *dir = parent->d_inode;
+ struct dentry *dentry;
struct inode *inode;
- inode = nfsd_get_inode(dir->i_sb, mode);
+ inode = nfsd_get_inode(parent->d_sb, S_IFDIR | 0600);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry)) {
+ iput(inode);
+ return dentry;
+ }
+ inode->i_fop = &simple_dir_operations;
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
if (ncl) {
inode->i_private = ncl;
kref_get(&ncl->cl_ref);
}
- d_add(dentry, inode);
+ d_instantiate(dentry, inode);
inc_nlink(dir);
fsnotify_mkdir(dir, dentry);
- return 0;
-}
-
-static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name)
-{
- struct inode *dir = parent->d_inode;
- struct dentry *dentry;
- int ret = -ENOMEM;
-
- inode_lock(dir);
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- goto out_err;
- ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl);
- if (ret)
- goto out_err;
-out:
inode_unlock(dir);
return dentry;
-out_err:
- dput(dentry);
- dentry = ERR_PTR(ret);
- goto out;
}
#if IS_ENABLED(CONFIG_SUNRPC_GSS)
-static int __nfsd_symlink(struct inode *dir, struct dentry *dentry,
- umode_t mode, const char *content)
-{
- struct inode *inode;
-
- inode = nfsd_get_inode(dir->i_sb, mode);
- if (!inode)
- return -ENOMEM;
-
- inode->i_link = (char *)content;
- inode->i_size = strlen(content);
-
- d_add(dentry, inode);
- inc_nlink(dir);
- fsnotify_create(dir, dentry);
- return 0;
-}
-
/*
* @content is assumed to be a NUL-terminated string that lives
* longer than the symlink itself.
@@ -1194,17 +1153,25 @@ static void _nfsd_symlink(struct dentry *parent, const char *name,
const char *content)
{
struct inode *dir = parent->d_inode;
+ struct inode *inode;
struct dentry *dentry;
- int ret;
- inode_lock(dir);
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- goto out;
- ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content);
- if (ret)
- dput(dentry);
-out:
+ inode = nfsd_get_inode(dir->i_sb, S_IFLNK | 0777);
+ if (!inode)
+ return;
+
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry)) {
+ iput(inode);
+ return;
+ }
+
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = (char *)content;
+ inode->i_size = strlen(content);
+
+ d_instantiate(dentry, inode);
+ fsnotify_create(dir, dentry);
inode_unlock(dir);
}
#else
@@ -1240,40 +1207,34 @@ struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
/* XXX: cut'n'paste from simple_fill_super; figure out if we could share
* code instead. */
-static int nfsdfs_create_files(struct dentry *root,
+static int nfsdfs_create_files(struct dentry *root,
const struct tree_descr *files,
struct nfsdfs_client *ncl,
struct dentry **fdentries)
{
struct inode *dir = d_inode(root);
- struct inode *inode;
struct dentry *dentry;
- int i;
- inode_lock(dir);
- for (i = 0; files->name && files->name[0]; i++, files++) {
- dentry = d_alloc_name(root, files->name);
- if (!dentry)
- goto out;
- inode = nfsd_get_inode(d_inode(root)->i_sb,
- S_IFREG | files->mode);
- if (!inode) {
- dput(dentry);
- goto out;
+ for (int i = 0; files->name && files->name[0]; i++, files++) {
+ struct inode *inode = nfsd_get_inode(root->d_sb,
+ S_IFREG | files->mode);
+ if (!inode)
+ return -ENOMEM;
+ dentry = simple_start_creating(root, files->name);
+ if (IS_ERR(dentry)) {
+ iput(inode);
+ return PTR_ERR(dentry);
}
kref_get(&ncl->cl_ref);
inode->i_fop = files->ops;
inode->i_private = ncl;
- d_add(dentry, inode);
+ d_instantiate(dentry, inode);
fsnotify_create(dir, dentry);
if (fdentries)
fdentries[i] = dentry;
+ inode_unlock(dir);
}
- inode_unlock(dir);
return 0;
-out:
- inode_unlock(dir);
- return -ENOMEM;
}
/* on success, returns positive number unique to that client. */
@@ -1993,7 +1954,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
* remaining listeners and recreate the list.
*/
if (delete)
- svc_xprt_destroy_all(serv, net);
+ svc_xprt_destroy_all(serv, net, false);
/* walk list of addrs again, open any that still don't exist */
nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
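
Every conversion in this file follows the same shape: allocate the inode first, let simple_start_creating() lock the parent and reserve the name, then instantiate and unlock. Condensed into one hedged sketch, as it would sit alongside the code above (pseudo_mkfile is an illustrative name; it reuses nfsd_get_inode() from the hunk above, with error handling trimmed):

    static struct dentry *pseudo_mkfile(struct dentry *parent, const char *name,
                                        const struct file_operations *fops)
    {
        struct inode *inode;
        struct dentry *dentry;

        /* Allocate before taking the directory lock. */
        inode = nfsd_get_inode(parent->d_sb, S_IFREG | 0600);
        if (!inode)
            return ERR_PTR(-ENOMEM);

        /* Locks the parent and returns a negative dentry for @name. */
        dentry = simple_start_creating(parent, name);
        if (IS_ERR(dentry)) {
            iput(inode);
            return dentry;
        }

        inode->i_fop = fops;
        d_instantiate(dentry, inode);
        fsnotify_create(d_inode(parent), dentry);
        inode_unlock(d_inode(parent));
        return dentry;
    }
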
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 82b0111ac469..7057ddd7a0a8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net)
#endif
}
- svc_xprt_destroy_all(serv, net);
-
/*
* write_ports can create the server without actually starting
- * any threads--if we get shut down before any threads are
+ * any threads. If we get shut down before any threads are
* started, then nfsd_destroy_serv will be run before any of this
* other initialization has been done except the rpcb information.
*/
- svc_rpcb_cleanup(serv, net);
-
+ svc_xprt_destroy_all(serv, net, true);
nfsd_shutdown_net(net);
svc_destroy(&serv);
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a664fdf1161e..6e2c8e2aab10 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc,
)
);
+TRACE_EVENT(nfsd_file_get_dio_attrs,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct kstat *stat
+ ),
+ TP_ARGS(inode, stat),
+ TP_STRUCT__entry(
+ __field(const void *, inode)
+ __field(unsigned long, mask)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(u32, read_offset_align)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->mask = stat->result_mask;
+ __entry->mem_align = stat->dio_mem_align;
+ __entry->offset_align = stat->dio_offset_align;
+ __entry->read_offset_align = stat->dio_read_offset_align;
+ ),
+ TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u",
+ __entry->inode, show_statx_mask(__entry->mask),
+ __entry->mem_align, __entry->offset_align,
+ __entry->read_offset_align
+ )
+);
+
TRACE_EVENT(nfsd_file_acquire,
TP_PROTO(
const struct svc_rqst *rqstp,
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index eff04959606f..fde3e0c11dba 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -185,6 +185,10 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
u32 request_mask = STATX_BASIC_STATS;
struct path p = {.mnt = fh->fh_export->ex_path.mnt,
.dentry = fh->fh_dentry};
+ struct inode *inode = d_inode(p.dentry);
+
+ if (S_ISREG(inode->i_mode))
+ request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN);
if (fh->fh_maxsize == NFS4_FHSIZE)
request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index b78308975082..39e60218df7c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -441,7 +441,9 @@ struct fanotify_perm_event {
size_t count;
u32 response; /* userspace answer to the event */
unsigned short state; /* state of the event */
+ unsigned short watchdog_cnt; /* number of watchdog scans (0-2) */
int fd; /* fd we passed to userspace for this event */
+ pid_t recv_pid; /* pid of task receiving the event */
union {
struct fanotify_response_info_header hdr;
struct fanotify_response_info_audit_rule audit_rule;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b192ee068a7a..1dadda82cae5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -50,6 +50,7 @@
/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;
+static int perm_group_timeout __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
+ {
+ .procname = "watchdog_timeout",
+ .data = &perm_group_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
};
static void __init fanotify_sysctls_init(void)
@@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void)
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
+static LIST_HEAD(perm_group_list);
+static DEFINE_SPINLOCK(perm_group_lock);
+static void perm_group_watchdog(struct work_struct *work);
+static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog);
+
+static void perm_group_watchdog_schedule(void)
+{
+ schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout));
+}
+
+static void perm_group_watchdog(struct work_struct *work)
+{
+ struct fsnotify_group *group;
+ struct fanotify_perm_event *event;
+ struct task_struct *task;
+ pid_t failed_pid = 0;
+
+ guard(spinlock)(&perm_group_lock);
+ if (list_empty(&perm_group_list))
+ return;
+
+ list_for_each_entry(group, &perm_group_list,
+ fanotify_data.perm_grp_list) {
+ /*
+ * OK to test without the lock; racing with an addition is
+ * fine, we will deal with it on the next round.
+ */
+ if (list_empty(&group->fanotify_data.access_list))
+ continue;
+
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (likely(event->watchdog_cnt == 0)) {
+ event->watchdog_cnt = 1;
+ } else if (event->watchdog_cnt == 1) {
+ /* Report on event only once */
+ event->watchdog_cnt = 2;
+
+ /* Do not report same pid repeatedly */
+ if (event->recv_pid == failed_pid)
+ continue;
+
+ failed_pid = event->recv_pid;
+ rcu_read_lock();
+ task = find_task_by_pid_ns(event->recv_pid,
+ &init_pid_ns);
+ pr_warn_ratelimited(
+ "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n",
+ event->recv_pid,
+ task ? task->comm : NULL,
+ perm_group_timeout);
+ rcu_read_unlock();
+ }
+ }
+ spin_unlock(&group->notification_lock);
+ }
+ perm_group_watchdog_schedule();
+}
+
+static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group)
+{
+ if (!list_empty(&group->fanotify_data.perm_grp_list)) {
+ /* Perm event watchdog can no longer scan this group. */
+ spin_lock(&perm_group_lock);
+ list_del_init(&group->fanotify_data.perm_grp_list);
+ spin_unlock(&perm_group_lock);
+ }
+}
+
+static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group)
+{
+ if (!perm_group_timeout)
+ return;
+
+ spin_lock(&perm_group_lock);
+ if (list_empty(&group->fanotify_data.perm_grp_list)) {
+ /* Add to perm_group_list for monitoring by watchdog. */
+ if (list_empty(&perm_group_list))
+ perm_group_watchdog_schedule();
+ list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list);
+ }
+ spin_unlock(&perm_group_lock);
+}
+
/*
* All flags that may be specified in parameter event_f_flags of fanotify_init.
*
@@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
spin_lock(&group->notification_lock);
list_add_tail(&event->fse.list,
&group->fanotify_data.access_list);
+ FANOTIFY_PERM(event)->recv_pid = current->pid;
spin_unlock(&group->notification_lock);
}
}
@@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
fsnotify_group_stop_queueing(group);
+ fanotify_perm_watchdog_group_remove(group);
+
/*
* Process all permission events on access_list and notification queue
* and simulate reply from userspace.
@@ -1465,6 +1562,10 @@ out:
fsnotify_group_unlock(group);
fsnotify_put_mark(fsn_mark);
+
+ if (!ret && (mask & FANOTIFY_PERM_EVENTS))
+ fanotify_perm_watchdog_group_add(group);
+
return ret;
}
@@ -1625,6 +1726,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.f_flags = event_f_flags;
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
+ INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list);
switch (class) {
case FAN_CLASS_NOTIF:
group->priority = FSNOTIFY_PRIO_NORMAL;
@@ -1999,7 +2101,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt->mnt_sb;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ ret = -EINVAL;
mntns = mnt_ns_from_dentry(path.dentry);
+ if (!mntns)
+ goto path_put_and_out;
user_ns = mntns->user_ns;
obj = mntns;
}
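
The watchdog machinery above reduces to a familiar pattern: a global list guarded by a spinlock, and a delayed work item that rearms itself only while the list is non-empty. A hedged skeleton with illustrative names:

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    static int watch_timeout = 10;  /* seconds; stands in for perm_group_timeout */
    static LIST_HEAD(watch_list);
    static DEFINE_SPINLOCK(watch_lock);
    static void watchdog_fn(struct work_struct *work);
    static DECLARE_DELAYED_WORK(watch_work, watchdog_fn);

    static void watchdog_fn(struct work_struct *work)
    {
        guard(spinlock)(&watch_lock);
        if (list_empty(&watch_list))
            return;         /* last entry gone; let the timer lapse */
        /* ... scan entries, warn about stalled ones ... */
        schedule_delayed_work(&watch_work, secs_to_jiffies(watch_timeout));
    }

    static void watch_add(struct list_head *entry)
    {
        spin_lock(&watch_lock);
        if (list_empty(&watch_list))    /* first entry arms the timer */
            schedule_delayed_work(&watch_work, secs_to_jiffies(watch_timeout));
        list_add_tail(entry, &watch_list);
        spin_unlock(&watch_lock);
    }
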
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index cd7d11b0eb08..7c326ec2e8a8 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -10,7 +10,7 @@
* Copyright 2006 Hewlett-Packard Development Company, L.P.
*
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
- * inotify was largely rewriten to make use of the fsnotify infrastructure
+ * inotify was largely rewritten to make use of the fsnotify infrastructure
*/
#include <linux/dcache.h> /* d_unlinked */
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 04107b950717..65d05e6a0566 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -1371,6 +1371,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
mark_buffer_dirty(bh);
unlock_buffer(bh);
/* err = sync_dirty_buffer(bh); */
+ put_bh(bh);
b0 = 0;
bits -= op;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index c1ece707b195..4c90ec2fa2ea 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -49,6 +49,30 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
return 0;
}
+static int ntfs_ioctl_get_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf)
+{
+ if (copy_to_user(buf, sbi->volume.label, FSLABEL_MAX))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ntfs_ioctl_set_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf)
+{
+ u8 user[FSLABEL_MAX] = {0};
+ int len;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(user, buf, FSLABEL_MAX))
+ return -EFAULT;
+
+ len = strnlen(user, FSLABEL_MAX);
+
+ return ntfs_set_label(sbi, user, len);
+}
+
/*
* ntfs_ioctl - file_operations::unlocked_ioctl
*/
@@ -64,6 +88,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
switch (cmd) {
case FITRIM:
return ntfs_ioctl_fitrim(sbi, arg);
+ case FS_IOC_GETFSLABEL:
+ return ntfs_ioctl_get_volume_label(sbi, (u8 __user *)arg);
+ case FS_IOC_SETFSLABEL:
+ return ntfs_ioctl_set_volume_label(sbi, (u8 __user *)arg);
}
return -ENOTTY; /* Inappropriate ioctl for device. */
}
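
FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL are the generic label ioctls from <linux/fs.h>, so an ntfs3 label can now be read by the same code that works on ext4 or btrfs. A minimal userspace sketch:

    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        char label[FSLABEL_MAX] = "";
        int fd;

        if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
            return 1;
        if (ioctl(fd, FS_IOC_GETFSLABEL, label)) {
            close(fd);
            return 1;
        }
        printf("label: %s\n", label);
        close(fd);
        return 0;
    }
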
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 1bf2a6593dec..6d1bf890929d 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1508,6 +1508,16 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size);
}
+ /*
+ * Index blocks exist, but $BITMAP has zero valid bits.
+ * This implies an on-disk corruption and must be rejected.
+ */
+ if (in->name == I30_NAME &&
+ unlikely(bmp_size_v == 0 && indx->alloc_run.count)) {
+ err = -EINVAL;
+ goto out1;
+ }
+
bit = bmp_size << 3;
}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 48b4f73a93ee..3959f23c487a 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -471,6 +471,7 @@ end_enum:
fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
/* Records in $Extend are not regular files or general directories. */
inode->i_op = &ntfs_file_inode_operations;
+ mode = S_IFREG;
} else {
err = -EINVAL;
goto out;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 1296e6fcc779..630128716ea7 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -280,7 +280,7 @@ struct ntfs_sb_info {
__le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY.
u8 major_ver;
u8 minor_ver;
- char label[256];
+ char label[FSLABEL_MAX];
bool real_dirty; // Real fs state.
} volume;
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index 6e86d66197ef..88550085f745 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/log2.h>
+#include <linux/overflow.h>
#include "debug.h"
#include "ntfs.h"
@@ -982,14 +983,18 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
if (!dlcn)
return -EINVAL;
- lcn = prev_lcn + dlcn;
+
+ if (check_add_overflow(prev_lcn, dlcn, &lcn))
+ return -EINVAL;
prev_lcn = lcn;
} else {
/* The size of 'dlcn' can't be > 8. */
return -EINVAL;
}
- next_vcn = vcn64 + len;
+ if (check_add_overflow(vcn64, len, &next_vcn))
+ return -EINVAL;
+
/* Check boundary. */
if (next_vcn > evcn + 1)
return -EINVAL;
@@ -1153,7 +1158,8 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn)
return -EINVAL;
run_buf += size_size + offset_size;
- vcn64 += len;
+ if (check_add_overflow(vcn64, len, &vcn64))
+ return -EINVAL;
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
if (vcn64 > 0x100000000ull)
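
check_add_overflow() from <linux/overflow.h> stores the sum in its third argument and returns true if the addition wrapped, which is what lets each converted site above turn a silent wrap into -EINVAL. The shape of the check, isolated into a hedged sketch:

    #include <linux/overflow.h>
    #include <linux/types.h>

    static int add_clusters(u64 base, u64 delta, u64 *out)
    {
        /* true means *out wrapped around; reject the run fragment */
        if (check_add_overflow(base, delta, out))
            return -EINVAL;
        return 0;
    }
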
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 821cb7874685..162711cc5b20 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6928,8 +6928,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,
out:
if (ret != 0) {
- if (folios)
- ocfs2_unlock_and_free_folios(folios, numfolios);
+ ocfs2_unlock_and_free_folios(folios, numfolios);
numfolios = 0;
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 86bb1a03bcc1..4145e06d2c08 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1477,7 +1477,6 @@ way_up_top:
goto send_response;
} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
spin_unlock(&res->spinlock);
- // mlog(0, "node %u is the master\n", res->owner);
response = DLM_MASTER_RESP_NO;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
@@ -1493,7 +1492,6 @@ way_up_top:
BUG();
}
- // mlog(0, "lockres is in progress...\n");
spin_lock(&dlm->master_lock);
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
@@ -1503,8 +1501,6 @@ way_up_top:
set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->type == DLM_MLE_BLOCK) {
- // mlog(0, "this node is waiting for "
- // "lockres to be mastered\n");
response = DLM_MASTER_RESP_NO;
} else if (tmpmle->type == DLM_MLE_MIGRATION) {
mlog(0, "node %u is master, but trying to migrate to "
@@ -1531,8 +1527,6 @@ way_up_top:
} else
response = DLM_MASTER_RESP_NO;
} else {
- // mlog(0, "this node is attempting to "
- // "master lockres\n");
response = DLM_MASTER_RESP_MAYBE;
}
if (set_maybe)
@@ -1559,7 +1553,6 @@ way_up_top:
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
/* this lockid has never been seen on this node yet */
- // mlog(0, "no mle found\n");
if (!mle) {
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -1573,8 +1566,6 @@ way_up_top:
goto way_up_top;
}
- // mlog(0, "this is second time thru, already allocated, "
- // "add the block.\n");
dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
set_bit(request->node_idx, mle->maybe_map);
__dlm_insert_mle(dlm, mle);
@@ -1897,8 +1888,6 @@ ok:
spin_unlock(&res->spinlock);
}
- // mlog(0, "woo! got an assert_master from node %u!\n",
- // assert->node_idx);
if (mle) {
int extra_ref = 0;
int nn = -1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 00f52812dbb0..843ee02bd85f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -464,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
- // mlog(0, "nothing to recover! sleeping now!\n");
spin_unlock(&dlm->spinlock);
/* return to main thread loop and sleep. */
return 0;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 6c4f78f473fb..fcc89856ab95 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1495,6 +1495,14 @@ int ocfs2_validate_inode_block(struct super_block *sb,
goto bail;
}
+ if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
+ (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(di->i_suballoc_slot));
+ goto bail;
+ }
+
rc = 0;
bail:
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index db14c92302a1..b6864602814c 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -358,13 +358,11 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
goto bail;
}
} else {
- ocfs2_sprintf_system_inode_name(namebuf,
- sizeof(namebuf),
- type, i);
+ int len = ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, i);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
- namebuf,
- strlen(namebuf),
- &blkno);
+ namebuf, len, &blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
@@ -651,12 +649,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
goto bail;
}
} else {
- ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
- OCFS2_INVALID_SLOT);
+ int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf),
+ type, OCFS2_INVALID_SLOT);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
- namebuf,
- strlen(namebuf),
- &blkno);
+ namebuf, len, &blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index cbe2f8ed8897..86f2631e6360 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -364,7 +364,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
int *vict_bit,
struct buffer_head **ret_bh)
{
- int ret, i, bits_per_unit = 0;
+ int ret, i, len, bits_per_unit = 0;
u64 blkno;
char namebuf[40];
@@ -375,9 +375,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
struct ocfs2_dinode *ac_dinode;
struct ocfs2_group_desc *bg;
- ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
- ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
- strlen(namebuf), &blkno);
+ len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+ ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno);
+
if (ret) {
ret = -ENOENT;
goto out;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e8e94599e907..ae0e44e5f2ad 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -614,7 +614,7 @@ struct ocfs2_super_block {
__le16 s_reserved0;
__le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
* s_uuid_hash serves as seed[3]. */
-/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
+/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */
/*140*/
/*
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 765105f1ff8a..be0a5758bd40 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1011,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
printk(KERN_ERR "ocfs2: Could not determine"
" locking version\n");
user_cluster_disconnect(conn);
+ lc = NULL;
goto out;
}
wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 53a945da873b..d53a6cc866be 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -127,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
char namebuf[40];
struct inode *inode = NULL;
u64 blkno;
- int status = 0;
+ int len, status = 0;
- ocfs2_sprintf_system_inode_name(namebuf,
- sizeof(namebuf),
- type, slot);
+ len = ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, slot);
- status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
- strlen(namebuf), &blkno);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf, len, &blkno);
if (status < 0) {
goto bail;
}
diff --git a/fs/open.c b/fs/open.c
index 9655158c3885..4890b13461c7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1059,18 +1059,20 @@ EXPORT_SYMBOL(finish_open);
* finish_no_open - finish ->atomic_open() without opening the file
*
* @file: file pointer
- * @dentry: dentry or NULL (as returned from ->lookup())
+ * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup())
*
- * This can be used to set the result of a successful lookup in ->atomic_open().
+ * This can be used to set the result of a lookup in ->atomic_open().
*
* NB: unlike finish_open() this function does consume the dentry reference and
* the caller need not dput() it.
*
- * Returns "0" which must be the return value of ->atomic_open() after having
- * called this function.
+ * Returns 0 or -E..., which must be the return value of ->atomic_open() after
+ * having called this function.
*/
int finish_no_open(struct file *file, struct dentry *dentry)
{
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
file->f_path.dentry = dentry;
return 0;
}
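
With finish_no_open() accepting an ERR_PTR, an ->atomic_open() implementation can pass its lookup result straight through without unwrapping it first. A hedged sketch of the calling convention (foofs_lookup() is a hypothetical ->lookup helper):

    #include <linux/fs.h>

    static struct dentry *foofs_lookup(struct inode *dir, struct dentry *dentry,
                                       unsigned int flags);  /* hypothetical */

    static int foofs_atomic_open(struct inode *dir, struct dentry *dentry,
                                 struct file *file, unsigned int open_flags,
                                 umode_t mode)
    {
        struct dentry *res;

        if (!(open_flags & O_CREAT)) {
            /* a dentry, NULL, or ERR_PTR(-E...) are all acceptable now */
            res = foofs_lookup(dir, dentry, 0);
            return finish_no_open(file, res);
        }
        /* ... O_CREAT path elided ... */
        return -ENOSYS;
    }
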
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 82395fe2b956..bec5475de094 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -38,8 +38,7 @@ static int orangefs_create(struct mnt_idmap *idmap,
new_op->upcall.req.create.parent_refn = parent->refn;
- fill_default_sys_attrs(new_op->upcall.req.create.attributes,
- ORANGEFS_TYPE_METAFILE, mode);
+ fill_default_sys_attrs(new_op->upcall.req.create.attributes, mode);
strscpy(new_op->upcall.req.create.d_name, dentry->d_name.name);
@@ -240,9 +239,7 @@ static int orangefs_symlink(struct mnt_idmap *idmap,
new_op->upcall.req.sym.parent_refn = parent->refn;
- fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
- ORANGEFS_TYPE_SYMLINK,
- mode);
+ fill_default_sys_attrs(new_op->upcall.req.sym.attributes, mode);
strscpy(new_op->upcall.req.sym.entry_name, dentry->d_name.name);
strscpy(new_op->upcall.req.sym.target, symname);
@@ -316,8 +313,7 @@ static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
new_op->upcall.req.mkdir.parent_refn = parent->refn;
- fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
- ORANGEFS_TYPE_DIRECTORY, mode);
+ fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, mode);
strscpy(new_op->upcall.req.mkdir.d_name, dentry->d_name.name);
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 1c375fb65018..79267b3419f2 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -440,14 +440,13 @@ static ssize_t orangefs_debug_write(struct file *file,
count = ORANGEFS_MAX_DEBUG_STRING_LEN;
}
- buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!buf)
- goto out;
-
- if (copy_from_user(buf, ubuf, count - 1)) {
+ buf = memdup_user_nul(ubuf, count - 1);
+ if (IS_ERR(buf)) {
gossip_debug(GOSSIP_DEBUGFS_DEBUG,
- "%s: copy_from_user failed!\n",
+ "%s: memdup_user_nul failed!\n",
__func__);
+ rc = PTR_ERR(buf);
+ buf = NULL;
goto out;
}
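
memdup_user_nul() collapses the kzalloc + copy_from_user pair into one call that returns a NUL-terminated kernel copy, or an ERR_PTR on failure. The resulting shape of a debugfs-style write handler, as a hedged sketch:

    #include <linux/fs.h>
    #include <linux/string.h>
    #include <linux/slab.h>
    #include <linux/err.h>

    static ssize_t foo_debug_write(struct file *file, const char __user *ubuf,
                                   size_t count, loff_t *ppos)
    {
        char *buf;

        /* one allocation + copy + NUL termination, or ERR_PTR on failure */
        buf = memdup_user_nul(ubuf, count);
        if (IS_ERR(buf))
            return PTR_ERR(buf);

        /* ... parse buf ... */
        kfree(buf);
        return count;
    }
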
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 3e153c2f6b82..29c6da43e396 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -462,7 +462,7 @@ int service_operation(struct orangefs_kernel_op_s *op,
((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
ORANGEFS_OP_INTERRUPTIBLE : 0)
-#define fill_default_sys_attrs(sys_attr, type, mode) \
+#define fill_default_sys_attrs(sys_attr, mode) \
do { \
sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \
sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 74ef75586f38..eee3c5ed1bbb 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -54,7 +54,9 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags)
static unsigned int xattr_key(const char *key)
{
unsigned int i = 0;
- while (key)
+ if (!key)
+ return 0;
+ while (*key)
i += *key++;
return i % 16;
}
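
With the fix, xattr_key() sums the bytes of the NUL-terminated name and folds the sum into one of 16 buckets; the old `while (key)` test never became false for a non-NULL pointer, so the loop walked past the end of the string. As a worked example (ASCII), "user.foo" sums to 117+115+101+114+46+102+111+111 = 817, and 817 % 16 = 1, so the entry lands in bucket 1.
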
@@ -175,8 +177,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
cx->length = -1;
cx->timeout = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
- hash_add(orangefs_inode->xattr_cache, &cx->node,
- xattr_key(cx->key));
+ hlist_add_head(&cx->node,
+ &orangefs_inode->xattr_cache[xattr_key(cx->key)]);
}
}
goto out_release_op;
@@ -229,8 +231,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
memcpy(cx->val, buffer, length);
cx->length = length;
cx->timeout = jiffies + HZ;
- hash_add(orangefs_inode->xattr_cache, &cx->node,
- xattr_key(cx->key));
+ hlist_add_head(&cx->node,
+ &orangefs_inode->xattr_cache[xattr_key(cx->key)]);
}
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 27396fe63f6d..66bd43a99d2e 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -670,7 +670,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
if (err)
return err;
- if (inode->i_flags & OVL_COPY_I_FLAGS_MASK &&
+ if (inode->i_flags & OVL_FATTR_I_FLAGS_MASK &&
(S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) {
/*
* Copy the fileattr inode flags that are the source of already
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index dbd63a74df4b..a5e9ddf3023b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -187,6 +187,13 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
/* mkdir is special... */
newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode);
err = PTR_ERR_OR_ZERO(newdentry);
+ /* expect to inherit casefolding from workdir/upperdir */
+ if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) {
+ pr_warn_ratelimited("wrong inherited casefold (%pd2)\n",
+ newdentry);
+ dput(newdentry);
+ err = -EINVAL;
+ }
break;
case S_IFCHR:
@@ -205,12 +212,32 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
err = -EPERM;
}
}
- if (!err && WARN_ON(!newdentry->d_inode)) {
+ if (err)
+ goto out;
+
+ if (WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -EIO;
+ } else if (d_unhashed(newdentry)) {
+ struct dentry *d;
+ /*
+ * Some filesystems (e.g. casefolded) may return an unhashed
+ * negative dentry from the ovl_lookup_upper() call before
+ * ovl_create_real().
+ * In that case, lookup again after making the newdentry
+ * positive, so ovl_create_upper() always returns a hashed
+ * positive dentry.
+ */
+ d = ovl_lookup_upper(ofs, newdentry->d_name.name, parent,
+ newdentry->d_name.len);
+ dput(newdentry);
+ if (IS_ERR_OR_NULL(d))
+ err = d ? PTR_ERR(d) : -ENOENT;
+ else
+ return d;
}
out:
if (err) {
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index ecb9f2019395..aaa4cf579561 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -1277,6 +1277,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
}
ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
ovl_inode_init(inode, oip, ino, fsid);
+ WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold);
if (upperdentry && ovl_is_impuredir(sb, upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 76d6248b625e..e93bcc5727bc 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -239,13 +239,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
char val;
/*
- * We allow filesystems that are case-folding capable but deny composing
- * ovl stack from case-folded directories. If someone has enabled case
- * folding on a directory on underlying layer, the warranty of the ovl
- * stack is voided.
+ * We allow filesystems that are case-folding capable as long as case
+ * folding is consistent across the stack: enabled for every dir or
+ * disabled in all dirs. If someone modifies case folding on a
+ * directory in an underlying layer, the warranty of the ovl stack is
+ * voided.
*/
- if (ovl_dentry_casefolded(base)) {
- warn = "case folded parent";
+ if (ofs->casefold != ovl_dentry_casefolded(base)) {
+ warn = "parent wrong casefold";
err = -ESTALE;
goto out_warn;
}
@@ -259,8 +260,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto out_err;
}
- if (ovl_dentry_casefolded(this)) {
- warn = "case folded child";
+ if (ofs->casefold != ovl_dentry_casefolded(this)) {
+ warn = "child wrong casefold";
err = -EREMOTE;
goto out_warn;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4f84abaa0d68..6fd4fbfb0908 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -820,10 +820,12 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip);
void ovl_copyattr(struct inode *to);
+/* vfs fileattr flags read from overlay.protattr xattr to ovl inode */
+#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE)
+/* vfs fileattr flags copied from real to ovl inode */
+#define OVL_FATTR_I_FLAGS_MASK (OVL_PROT_I_FLAGS_MASK | S_SYNC | S_NOATIME)
/* vfs inode flags copied from real to ovl inode */
-#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
-/* vfs inode flags read from overlay.protattr xattr to ovl inode */
-#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE)
+#define OVL_COPY_I_FLAGS_MASK (OVL_FATTR_I_FLAGS_MASK | S_CASEFOLD)
/*
* fileattr flags copied from lower to upper inode on copy up.
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 4c1bae935ced..1d4828dbcf7a 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -91,6 +91,7 @@ struct ovl_fs {
struct mutex whiteout_lock;
/* r/o snapshot of upperdir sb's only taken on volatile mounts */
errseq_t errseq;
+ bool casefold;
};
/* Number of lower layers, not including data-only layers */
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index f4e7fff909ac..63b7346c5ee1 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -276,17 +276,26 @@ static int ovl_mount_dir(const char *name, struct path *path)
static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
enum ovl_opt layer, const char *name, bool upper)
{
+ bool is_casefolded = ovl_dentry_casefolded(path->dentry);
struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs *ofs = fc->s_fs_info;
if (!d_is_dir(path->dentry))
return invalfc(fc, "%s is not a directory", name);
/*
* Allow filesystems that are case-folding capable but deny composing
- * ovl stack from case-folded directories.
+ * ovl stack from inconsistent case-folded directories.
*/
- if (ovl_dentry_casefolded(path->dentry))
- return invalfc(fc, "case-insensitive directory on %s not supported", name);
+ if (!ctx->casefold_set) {
+ ofs->casefold = is_casefolded;
+ ctx->casefold_set = true;
+ }
+
+ if (ofs->casefold != is_casefolded) {
+ return invalfc(fc, "case-%ssensitive directory on %s is inconsistent",
+ is_casefolded ? "in" : "", name);
+ }
if (ovl_dentry_weird(path->dentry))
return invalfc(fc, "filesystem on %s not supported", name);
diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h
index c96d93982021..ffd53cdd8482 100644
--- a/fs/overlayfs/params.h
+++ b/fs/overlayfs/params.h
@@ -33,6 +33,7 @@ struct ovl_fs_context {
struct ovl_opt_set set;
struct ovl_fs_context_layer *lower;
char *lowerdir_all; /* user provided lowerdir string */
+ bool casefold_set;
};
int ovl_init_fs_context(struct fs_context *fc);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 15cb06fa0c9a..1e9792cc557b 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -27,6 +27,8 @@ struct ovl_cache_entry {
bool is_upper;
bool is_whiteout;
bool check_xwhiteout;
+ const char *c_name;
+ int c_len;
char name[];
};
@@ -45,6 +47,7 @@ struct ovl_readdir_data {
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
+ struct unicode_map *map;
int count;
int err;
bool is_upper;
@@ -66,6 +69,31 @@ static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
return rb_entry(n, struct ovl_cache_entry, node);
}
+static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len,
+ char **dst)
+{
+ const struct qstr qstr = { .name = str, .len = len };
+ char *cf_name;
+ int cf_len;
+
+ if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len))
+ return 0;
+
+ cf_name = kmalloc(NAME_MAX, GFP_KERNEL);
+ if (!cf_name) {
+ rdd->err = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX);
+ if (cf_len > 0)
+ *dst = cf_name;
+ else
+ kfree(cf_name);
+
+ return cf_len;
+}
+
static bool ovl_cache_entry_find_link(const char *name, int len,
struct rb_node ***link,
struct rb_node **parent)
@@ -79,10 +107,10 @@ static bool ovl_cache_entry_find_link(const char *name, int len,
*parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
- cmp = strncmp(name, tmp->name, len);
+ cmp = strncmp(name, tmp->c_name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
- else if (cmp < 0 || len < tmp->len)
+ else if (cmp < 0 || len < tmp->c_len)
newp = &tmp->node.rb_left;
else
found = true;
@@ -101,10 +129,10 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
- cmp = strncmp(name, p->name, len);
+ cmp = strncmp(name, p->c_name, len);
if (cmp > 0)
node = p->node.rb_right;
- else if (cmp < 0 || len < p->len)
+ else if (cmp < 0 || len < p->c_len)
node = p->node.rb_left;
else
return p;
@@ -145,6 +173,7 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
+ const char *c_name, int c_len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
@@ -167,6 +196,14 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
/* Defer check for overlay.whiteout to ovl_iterate() */
p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
+ if (c_name && c_name != name) {
+ p->c_name = c_name;
+ p->c_len = c_len;
+ } else {
+ p->c_name = p->name;
+ p->c_len = len;
+ }
+
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
@@ -174,48 +211,62 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
return p;
}
-static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
- const char *name, int len, u64 ino,
+/* Return 0 for found, 1 for added, <0 for error */
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+ const char *name, int len,
+ const char *c_name, int c_len,
+ u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root->rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
- if (ovl_cache_entry_find_link(name, len, &newp, &parent))
- return true;
+ if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent))
+ return 0;
- p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return false;
+ return -ENOMEM;
}
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, rdd->root);
- return true;
+ return 1;
}
-static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
+/* Return 0 for found, 1 for added, <0 for error */
+static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
+ const char *c_name, int c_len,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
- p = ovl_cache_entry_find(rdd->root, name, namelen);
+ p = ovl_cache_entry_find(rdd->root, c_name, c_len);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
+ return 0;
} else {
- p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len,
+ ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
- return rdd->err == 0;
+ return rdd->err ?: 1;
+}
+
+static void ovl_cache_entry_free(struct ovl_cache_entry *p)
+{
+ if (p->c_name != p->name)
+ kfree(p->c_name);
+ kfree(p);
}
void ovl_cache_free(struct list_head *list)
@@ -224,7 +275,7 @@ void ovl_cache_free(struct list_head *list)
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
- kfree(p);
+ ovl_cache_entry_free(p);
INIT_LIST_HEAD(list);
}
@@ -260,12 +311,39 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
+ struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb);
+ const char *c_name = NULL;
+ char *cf_name = NULL;
+ int c_len = 0, ret;
+
+ if (ofs->casefold)
+ c_len = ovl_casefold(rdd, name, namelen, &cf_name);
+
+ if (rdd->err)
+ return false;
+
+ if (c_len <= 0) {
+ c_name = name;
+ c_len = namelen;
+ } else {
+ c_name = cf_name;
+ }
rdd->count++;
if (!rdd->is_lowest)
- return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+ ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type);
else
- return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
+ ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type);
+
+ /*
+ * If ret == 1, c_name is now owned by the struct ovl_cache_entry and
+ * will be freed by ovl_cache_free(). Otherwise, c_name was found in
+ * the rb-tree so we can free it here.
+ */
+ if (ret != 1 && c_name != name)
+ kfree(c_name);
+
+ return ret >= 0;
}
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
@@ -357,12 +435,18 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
.list = list,
.root = root,
.is_lowest = false,
+ .map = NULL,
};
int idx, next;
const struct ovl_layer *layer;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath, &layer);
+
+ if (ofs->casefold)
+ rdd.map = sb_encoding(realpath.dentry->d_sb);
+
rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
ovl_dentry_has_xwhiteouts(dentry);
@@ -555,7 +639,7 @@ static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
- p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
return false;
@@ -595,7 +679,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
}
if (p->ino == p->real_ino) {
list_del(&p->l_node);
- kfree(p);
+ ovl_cache_entry_free(p);
} else {
struct rb_node **newp = &root->rb_node;
struct rb_node *parent = NULL;
@@ -1023,7 +1107,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
del_entry:
list_del(&p->l_node);
- kfree(p);
+ ovl_cache_entry_free(p);
}
return err;
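
ovl_casefold() above is a thin wrapper around utf8_casefold(), which writes the casefolded form of a qstr into a caller-supplied buffer and returns the folded length (negative if the name cannot be folded). The call in isolation, as a hedged sketch:

    #include <linux/unicode.h>
    #include <linux/dcache.h>
    #include <linux/limits.h>

    static int fold_name(struct unicode_map *um, const char *name, int len,
                         unsigned char *buf)   /* at least NAME_MAX bytes */
    {
        const struct qstr q = QSTR_INIT(name, len);

        /* returns the folded length, or < 0 if the name cannot be folded */
        return utf8_casefold(um, &q, buf, NAME_MAX);
    }
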
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bd3d7ba8fb95..34875c46dbe2 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -161,6 +161,16 @@ static const struct dentry_operations ovl_dentry_operations = {
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
+#if IS_ENABLED(CONFIG_UNICODE)
+static const struct dentry_operations ovl_dentry_ci_operations = {
+ .d_real = ovl_d_real,
+ .d_revalidate = ovl_dentry_revalidate,
+ .d_weak_revalidate = ovl_dentry_weak_revalidate,
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+};
+#endif
+
static struct kmem_cache *ovl_inode_cachep;
static struct inode *ovl_alloc_inode(struct super_block *sb)
@@ -991,6 +1001,25 @@ static int ovl_get_data_fsid(struct ovl_fs *ofs)
return ofs->numfs;
}
+/*
+ * Set the ovl sb encoding to the one used by the first layer
+ */
+static int ovl_set_encoding(struct super_block *sb, struct super_block *fs_sb)
+{
+ if (!sb_has_encoding(fs_sb))
+ return 0;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (sb_has_strict_encoding(fs_sb)) {
+ pr_err("strict encoding not supported\n");
+ return -EINVAL;
+ }
+
+ sb->s_encoding = fs_sb->s_encoding;
+ sb->s_encoding_flags = fs_sb->s_encoding_flags;
+#endif
+ return 0;
+}
static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
struct ovl_fs_context *ctx, struct ovl_layer *layers)
@@ -1024,6 +1053,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
if (ovl_upper_mnt(ofs)) {
ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb;
ofs->fs[0].is_lower = false;
+
+ if (ofs->casefold) {
+ err = ovl_set_encoding(sb, ofs->fs[0].sb);
+ if (err)
+ return err;
+ }
}
nr_merged_lower = ctx->nr - ctx->nr_data;
@@ -1083,6 +1118,19 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
l->name = NULL;
ofs->numlayer++;
ofs->fs[fsid].is_lower = true;
+
+ if (ofs->casefold) {
+ if (!ovl_upper_mnt(ofs) && !sb_has_encoding(sb)) {
+ err = ovl_set_encoding(sb, ofs->fs[fsid].sb);
+ if (err)
+ return err;
+ }
+
+ if (!sb_same_encoding(sb, mnt->mnt_sb)) {
+ pr_err("all layers must have the same encoding\n");
+ return -EINVAL;
+ }
+ }
}
/*
@@ -1300,6 +1348,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ovl_dentry_set_flag(OVL_E_CONNECTED, root);
ovl_set_upperdata(d_inode(root));
ovl_inode_init(d_inode(root), &oip, ino, fsid);
+ WARN_ON(!!IS_CASEFOLDED(d_inode(root)) != ofs->casefold);
ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE);
/* root keeps a reference of upperdentry */
dget(upperdentry);
@@ -1307,6 +1356,19 @@ static struct dentry *ovl_get_root(struct super_block *sb,
return root;
}
+static void ovl_set_d_op(struct super_block *sb)
+{
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->casefold) {
+ set_default_d_op(sb, &ovl_dentry_ci_operations);
+ return;
+ }
+#endif
+ set_default_d_op(sb, &ovl_dentry_operations);
+}
+
int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ovl_fs *ofs = sb->s_fs_info;
@@ -1322,7 +1384,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (WARN_ON(fc->user_ns != current_user_ns()))
goto out_err;
- set_default_d_op(sb, &ovl_dentry_operations);
+ ovl_set_d_op(sb);
err = -ENOMEM;
if (!ofs->creator_cred)
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 41033bac96cb..bec4a39d1b97 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -210,11 +210,11 @@ bool ovl_dentry_weird(struct dentry *dentry)
return true;
/*
- * Allow filesystems that are case-folding capable but deny composing
- * ovl stack from case-folded directories.
+ * As an exception, layers with casefold enabled are allowed to have
+ * their own hash and compare operations.
*/
if (sb_has_encoding(dentry->d_sb))
- return IS_CASEFOLDED(d_inode(dentry));
+ return false;
return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE);
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 6f7d02f3fa98..5d91c3e58d2a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p)
return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
+/* locks: namespace_shared && is_mounted(mnt) */
static struct mount *get_peer_under_root(struct mount *mnt,
struct mnt_namespace *ns,
const struct path *root)
@@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt,
* Get ID of closest dominating peer group having a representative
* under the given root.
*
- * Caller must hold namespace_sem
+ * locks: namespace_shared
*/
int get_dominating_id(struct mount *mnt, const struct path *root)
{
@@ -70,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m)
return m->mnt.mnt_flags & MNT_UMOUNT;
}
-static struct mount *propagation_source(struct mount *mnt)
-{
- do {
- struct mount *m;
- for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
- if (!will_be_unmounted(m))
- return m;
- }
- mnt = mnt->mnt_master;
- } while (mnt && will_be_unmounted(mnt));
- return mnt;
-}
-
static void transfer_propagation(struct mount *mnt, struct mount *to)
{
struct hlist_node *p = NULL, *n;
@@ -111,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type)
return;
}
if (IS_MNT_SHARED(mnt)) {
- if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
- m = propagation_source(mnt);
if (list_empty(&mnt->mnt_share)) {
mnt_release_group_id(mnt);
} else {
+ m = next_peer(mnt);
list_del_init(&mnt->mnt_share);
mnt->mnt_group_id = 0;
}
@@ -136,6 +123,57 @@ void change_mnt_propagation(struct mount *mnt, int type)
}
}
+static struct mount *trace_transfers(struct mount *m)
+{
+ while (1) {
+ struct mount *next = next_peer(m);
+
+ if (next != m) {
+ list_del_init(&m->mnt_share);
+ m->mnt_group_id = 0;
+ m->mnt_master = next;
+ } else {
+ if (IS_MNT_SHARED(m))
+ mnt_release_group_id(m);
+ next = m->mnt_master;
+ }
+ hlist_del_init(&m->mnt_slave);
+ CLEAR_MNT_SHARED(m);
+ SET_MNT_MARK(m);
+
+ if (!next || !will_be_unmounted(next))
+ return next;
+ if (IS_MNT_MARKED(next))
+ return next->mnt_master;
+ m = next;
+ }
+}
+
+static void set_destinations(struct mount *m, struct mount *master)
+{
+ struct mount *next;
+
+ while ((next = m->mnt_master) != master) {
+ m->mnt_master = master;
+ m = next;
+ }
+}
+
+void bulk_make_private(struct list_head *set)
+{
+ struct mount *m;
+
+ list_for_each_entry(m, set, mnt_list)
+ if (!IS_MNT_MARKED(m))
+ set_destinations(m, trace_transfers(m));
+
+ list_for_each_entry(m, set, mnt_list) {
+ transfer_propagation(m, m->mnt_master);
+ m->mnt_master = NULL;
+ CLEAR_MNT_MARK(m);
+ }
+}
+
static struct mount *__propagation_next(struct mount *m,
struct mount *origin)
{
@@ -304,9 +342,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
err = PTR_ERR(this);
break;
}
- read_seqlock_excl(&mount_lock);
- mnt_set_mountpoint(n, dest_mp, this);
- read_sequnlock_excl(&mount_lock);
+ scoped_guard(mount_locked_reader)
+ mnt_set_mountpoint(n, dest_mp, this);
if (n->mnt_master)
SET_MNT_MARK(n->mnt_master);
copy = this;
diff --git a/fs/pnode.h b/fs/pnode.h
index 00ab153e3e9d..b029db225f33 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
}
void change_mnt_propagation(struct mount *, int);
+void bulk_make_private(struct list_head *);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
void propagate_umount(struct list_head *);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b997ceef9135..6299878e3d97 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3947,7 +3947,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
tid = task_pid_nr_ns(task, ns);
if (!tid)
continue; /* The task has just exited. */
- len = snprintf(name, sizeof(name), "%u", tid);
+ len = snprintf(name, sizeof(name), "%d", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index afa15a214538..6c4a6ee1fa2b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
+/* workqueue for quota_release_work */
+static struct workqueue_struct *quota_unbound_wq;
+
void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
@@ -881,7 +884,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_dfl_wq, &quota_release_work, 1);
+ queue_delayed_work(quota_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -3041,6 +3044,11 @@ static int __init dquot_init(void)
shrinker_register(dqcache_shrinker);
+ quota_unbound_wq = alloc_workqueue("quota_events_unbound",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ if (!quota_unbound_wq)
+ panic("Cannot create quota_unbound_wq\n");
+
return 0;
}
fs_initcall(dquot_init);
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 9f05f94e265a..a4c02199fef4 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -15,6 +15,7 @@ config CIFS
select CRYPTO_GCM
select CRYPTO_ECB
select CRYPTO_AES
+ select CRYPTO_LIB_ARC4
select KEYS
select DNS_RESOLVER
select ASN1
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index b69daeb1301b..b36f9f9340f0 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -36,9 +36,8 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
* fully cached or it may be in the process of
* being deleted due to a lease break.
*/
- if (!cfid->time || !cfid->has_lease) {
+ if (!is_valid_cached_dir(cfid))
return NULL;
- }
kref_get(&cfid->refcount);
return cfid;
}
@@ -194,7 +193,7 @@ replay_again:
* Otherwise, it is either a new entry or laundromat worker removed it
* from @cfids->entries. Caller will put last reference if the latter.
*/
- if (cfid->has_lease && cfid->time) {
+ if (is_valid_cached_dir(cfid)) {
cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
*ret_cfid = cfid;
@@ -233,7 +232,7 @@ replay_again:
list_for_each_entry(parent_cfid, &cfids->entries, entry) {
if (parent_cfid->dentry == dentry->d_parent) {
cifs_dbg(FYI, "found a parent cached file handle\n");
- if (parent_cfid->has_lease && parent_cfid->time) {
+ if (is_valid_cached_dir(parent_cfid)) {
lease_flags
|= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
memcpy(pfid->parent_lease_key,
@@ -417,12 +416,18 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
if (cfids == NULL)
return -EOPNOTSUPP;
+ if (!dentry)
+ return -ENOENT;
+
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry(cfid, &cfids->entries, entry) {
- if (dentry && cfid->dentry == dentry) {
+ if (cfid->dentry == dentry) {
+ if (!is_valid_cached_dir(cfid))
+ break;
cifs_dbg(FYI, "found a cached file handle by dentry\n");
kref_get(&cfid->refcount);
*ret_cfid = cfid;
+ cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
return 0;
}
@@ -522,10 +527,9 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
spin_unlock(&cifs_sb->tlink_tree_lock);
goto done;
}
- spin_lock(&cfid->fid_lock);
+
tmp_list->dentry = cfid->dentry;
cfid->dentry = NULL;
- spin_unlock(&cfid->fid_lock);
list_add_tail(&tmp_list->entry, &entry);
}
@@ -608,14 +612,9 @@ static void cached_dir_put_work(struct work_struct *work)
{
struct cached_fid *cfid = container_of(work, struct cached_fid,
put_work);
- struct dentry *dentry;
-
- spin_lock(&cfid->fid_lock);
- dentry = cfid->dentry;
+ dput(cfid->dentry);
cfid->dentry = NULL;
- spin_unlock(&cfid->fid_lock);
- dput(dentry);
queue_work(serverclose_wq, &cfid->close_work);
}
@@ -673,7 +672,6 @@ static struct cached_fid *init_cached_dir(const char *path)
INIT_LIST_HEAD(&cfid->entry);
INIT_LIST_HEAD(&cfid->dirents.entries);
mutex_init(&cfid->dirents.de_mutex);
- spin_lock_init(&cfid->fid_lock);
kref_init(&cfid->refcount);
return cfid;
}
@@ -697,6 +695,21 @@ static void free_cached_dir(struct cached_fid *cfid)
kfree(dirent);
}
+ /* adjust tcon-level counters and reset per-dir accounting */
+ if (cfid->cfids) {
+ if (cfid->dirents.entries_count)
+ atomic_long_sub((long)cfid->dirents.entries_count,
+ &cfid->cfids->total_dirents_entries);
+ if (cfid->dirents.bytes_used) {
+ atomic64_sub((long long)cfid->dirents.bytes_used,
+ &cfid->cfids->total_dirents_bytes);
+ atomic64_sub((long long)cfid->dirents.bytes_used,
+ &cifs_dircache_bytes_used);
+ }
+ }
+ cfid->dirents.entries_count = 0;
+ cfid->dirents.bytes_used = 0;
+
kfree(cfid->path);
cfid->path = NULL;
kfree(cfid);
@@ -725,7 +738,6 @@ static void cfids_laundromat_worker(struct work_struct *work)
{
struct cached_fids *cfids;
struct cached_fid *cfid, *q;
- struct dentry *dentry;
LIST_HEAD(entry);
cfids = container_of(work, struct cached_fids, laundromat_work.work);
@@ -752,12 +764,9 @@ static void cfids_laundromat_worker(struct work_struct *work)
list_for_each_entry_safe(cfid, q, &entry, entry) {
list_del(&cfid->entry);
- spin_lock(&cfid->fid_lock);
- dentry = cfid->dentry;
+ dput(cfid->dentry);
cfid->dentry = NULL;
- spin_unlock(&cfid->fid_lock);
- dput(dentry);
if (cfid->is_open) {
spin_lock(&cifs_tcp_ses_lock);
++cfid->tcon->tc_count;
@@ -792,6 +801,9 @@ struct cached_fids *init_cached_dirs(void)
queue_delayed_work(cfid_put_wq, &cfids->laundromat_work,
dir_cache_timeout * HZ);
+ atomic_long_set(&cfids->total_dirents_entries, 0);
+ atomic64_set(&cfids->total_dirents_bytes, 0);
+
return cfids;
}
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index 46b5a2fdf15b..31339dc32719 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -27,6 +27,9 @@ struct cached_dirents {
struct mutex de_mutex;
loff_t pos; /* Expected ctx->pos */
struct list_head entries;
+ /* accounting for cached entries in this directory */
+ unsigned long entries_count;
+ unsigned long bytes_used;
};
struct cached_fid {
@@ -41,7 +44,6 @@ struct cached_fid {
unsigned long last_access_time; /* jiffies of when last accessed */
struct kref refcount;
struct cifs_fid fid;
- spinlock_t fid_lock;
struct cifs_tcon *tcon;
struct dentry *dentry;
struct work_struct put_work;
@@ -62,8 +64,20 @@ struct cached_fids {
struct list_head dying;
struct work_struct invalidation_work;
struct delayed_work laundromat_work;
+ /* aggregate accounting for all cached dirents under this tcon */
+ atomic_long_t total_dirents_entries;
+ atomic64_t total_dirents_bytes;
};
+/* Module-wide directory cache accounting (defined in cifsfs.c) */
+extern atomic64_t cifs_dircache_bytes_used; /* bytes across all mounts */
+
+static inline bool
+is_valid_cached_dir(struct cached_fid *cfid)
+{
+ return cfid->time && cfid->has_lease;
+}
+
extern struct cached_fids *init_cached_dirs(void);
extern void free_cached_dirs(struct cached_fids *cfids);
extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 35c4d27d2cc0..1fb71d2d31b5 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -240,14 +240,18 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsFileInfo *cfile;
+ struct inode *inode;
+ struct cifsInodeInfo *cinode;
+ char lease[4];
+ int n;
seq_puts(m, "# Version:1\n");
seq_puts(m, "# Format:\n");
seq_puts(m, "# <tree id> <ses id> <persistent fid> <flags> <count> <pid> <uid>");
#ifdef CONFIG_CIFS_DEBUG2
- seq_printf(m, " <filename> <mid>\n");
+ seq_puts(m, " <filename> <lease> <mid>\n");
#else
- seq_printf(m, " <filename>\n");
+ seq_puts(m, " <filename> <lease>\n");
#endif /* CIFS_DEBUG2 */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
@@ -267,11 +271,30 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
cfile->pid,
from_kuid(&init_user_ns, cfile->uid),
cfile->dentry);
+
+ /* Append lease/oplock caching state as RHW letters */
+ inode = d_inode(cfile->dentry);
+ n = 0;
+ if (inode) {
+ cinode = CIFS_I(inode);
+ if (CIFS_CACHE_READ(cinode))
+ lease[n++] = 'R';
+ if (CIFS_CACHE_HANDLE(cinode))
+ lease[n++] = 'H';
+ if (CIFS_CACHE_WRITE(cinode))
+ lease[n++] = 'W';
+ }
+ lease[n] = '\0';
+ seq_puts(m, " ");
+ if (n)
+ seq_printf(m, "%s", lease);
+ else
+ seq_puts(m, "NONE");
+
#ifdef CONFIG_CIFS_DEBUG2
- seq_printf(m, " %llu\n", cfile->fid.mid);
-#else
+ seq_printf(m, " %llu", cfile->fid.mid);
+#endif /* CONFIG_CIFS_DEBUG2 */
seq_printf(m, "\n");
-#endif /* CIFS_DEBUG2 */
}
spin_unlock(&tcon->open_file_lock);
}
@@ -308,7 +331,10 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
if (!cfids)
continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
- seq_printf(m, "Num entries: %d\n", cfids->num_entries);
+ seq_printf(m, "Num entries: %d, cached_dirents: %lu entries, %llu bytes\n",
+ cfids->num_entries,
+ (unsigned long)atomic_long_read(&cfids->total_dirents_entries),
+ (unsigned long long)atomic64_read(&cfids->total_dirents_bytes));
list_for_each_entry(cfid, &cfids->entries, entry) {
seq_printf(m, "0x%x 0x%llx 0x%llx %s",
tcon->tid,
@@ -319,6 +345,9 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\tvalid file info");
if (cfid->dirents.is_valid)
seq_printf(m, ", valid dirents");
+ if (!list_empty(&cfid->dirents.entries))
+ seq_printf(m, ", dirents: %lu entries, %lu bytes",
+ cfid->dirents.entries_count, cfid->dirents.bytes_used);
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 3cc686246908..7b7c8c38fdd0 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -22,8 +22,8 @@
#include <linux/highmem.h>
#include <linux/fips.h>
#include <linux/iov_iter.h>
-#include "../common/arc4.h"
#include <crypto/aead.h>
+#include <crypto/arc4.h>
static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
void *priv, void *priv2)
@@ -725,9 +725,9 @@ calc_seckey(struct cifs_ses *ses)
return -ENOMEM;
}
- cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
- cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
- CIFS_CPHTXT_SIZE);
+ arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
+ arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
+ CIFS_CPHTXT_SIZE);
/* make secondary_key/nonce as session key */
memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
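
calc_seckey() now uses the generic <crypto/arc4.h> library instead of the private cifs_arc4 copy; the call sequence is unchanged, only the cifs_ prefix drops. A self-contained sketch of that library API (buffer contents hypothetical):

	#include <crypto/arc4.h>
	#include <linux/slab.h>

	/* One-shot ARC4 encryption with the shared library (sketch). */
	static int arc4_oneshot(const u8 *key, unsigned int keylen,
				u8 *out, const u8 *in, unsigned int len)
	{
		struct arc4_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);

		if (!ctx)
			return -ENOMEM;
		arc4_setkey(ctx, key, keylen);
		arc4_crypt(ctx, out, in, len);
		kfree_sensitive(ctx);	/* the key schedule is sensitive */
		return 0;
	}
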
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index dcb39d1b5958..1775c2b7528f 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -121,6 +121,46 @@ unsigned int dir_cache_timeout = 30;
module_param(dir_cache_timeout, uint, 0644);
MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 "
"Range: 1 to 65000 seconds, 0 to disable caching dir contents");
+/* Module-wide total cached dirents (in bytes) across all tcons */
+atomic64_t cifs_dircache_bytes_used = ATOMIC64_INIT(0);
+
+/*
+ * Write-only module parameter to drop all cached directory entries across
+ * all CIFS mounts. Echo a non-zero value to trigger.
+ */
+static void cifs_drop_all_dir_caches(void)
+{
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list)
+ invalidate_all_cached_dirs(tcon);
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+static int cifs_param_set_drop_dir_cache(const char *val, const struct kernel_param *kp)
+{
+ bool bv;
+ int rc = kstrtobool(val, &bv);
+
+ if (rc)
+ return rc;
+ if (bv)
+ cifs_drop_all_dir_caches();
+ return 0;
+}
+
+module_param_call(drop_dir_cache, cifs_param_set_drop_dir_cache, NULL, NULL, 0200);
+MODULE_PARM_DESC(drop_dir_cache, "Write 1 to drop all cached directory entries across all CIFS mounts");
+
#ifdef CONFIG_CIFS_STATS2
unsigned int slow_rsp_threshold = 1;
module_param(slow_rsp_threshold, uint, 0644);
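
drop_dir_cache is registered with mode 0200 and no get hook, making it a pure write-to-trigger knob under /sys/module/cifs/parameters/. The same idiom in isolation, with hypothetical names:

	#include <linux/kstrtox.h>
	#include <linux/module.h>

	/* Write-only module parameter used as a trigger (sketch). */
	static int trigger_set(const char *val, const struct kernel_param *kp)
	{
		bool bv;
		int rc = kstrtobool(val, &bv);

		if (rc)
			return rc;
		if (bv)
			pr_info("trigger fired\n");	/* hypothetical action */
		return 0;
	}
	module_param_call(trigger, trigger_set, NULL, NULL, 0200);
	MODULE_PARM_DESC(trigger, "Write 1 to fire the trigger");
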
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 5223edf6d11a..fc67a6441c96 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -322,13 +322,14 @@ retry_open:
list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) {
if (parent_cfid->dentry == direntry->d_parent) {
cifs_dbg(FYI, "found a parent cached file handle\n");
- if (parent_cfid->has_lease && parent_cfid->time) {
+ if (is_valid_cached_dir(parent_cfid)) {
lease_flags
|= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
memcpy(fid->parent_lease_key,
parent_cfid->fid.lease_key,
SMB2_LEASE_KEY_SIZE);
parent_cfid->dirents.is_valid = false;
+ parent_cfid->dirents.is_failed = true;
}
break;
}
@@ -484,8 +485,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
* in network traffic in the other paths.
*/
if (!(oflags & O_CREAT)) {
- struct dentry *res;
-
/*
* Check for hashed negative dentry. We have already revalidated
* the dentry and it is fine. No need to perform another lookup.
@@ -493,11 +492,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
if (!d_in_lookup(direntry))
return -ENOENT;
- res = cifs_lookup(inode, direntry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- return finish_no_open(file, res);
+ return finish_no_open(file, cifs_lookup(inode, direntry, 0));
}
xid = get_xid();
@@ -683,6 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
const char *full_path;
void *page;
int retry_count = 0;
+ struct cached_fid *cfid = NULL;
xid = get_xid();
@@ -722,6 +718,28 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
cifs_dbg(FYI, "non-NULL inode in lookup\n");
} else {
cifs_dbg(FYI, "NULL inode in lookup\n");
+
+ /*
+ * We can only rely on negative dentries having the same
+ * spelling as the cached dirent if case insensitivity is
+ * forced on mount.
+ *
+ * XXX: if servers correctly announce case-sensitive search
+ * support on GetInfo of FileFsAttributeInformation, then we can
+ * take the correct action even when case insensitivity is not
+ * forced on mount.
+ */
+ if (pTcon->nocase && !open_cached_dir_by_dentry(pTcon, direntry->d_parent, &cfid)) {
+ /*
+ * dentry is negative and parent is fully cached:
+ * we can assume file does not exist
+ */
+ if (cfid->dirents.is_valid) {
+ close_cached_dir(cfid);
+ goto out;
+ }
+ close_cached_dir(cfid);
+ }
}
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
full_path, d_inode(direntry));
@@ -755,6 +773,8 @@ again:
}
newInode = ERR_PTR(rc);
}
+
+out:
free_dentry_path(page);
cifs_put_tlink(tlink);
free_xid(xid);
@@ -765,7 +785,8 @@ static int
cifs_d_revalidate(struct inode *dir, const struct qstr *name,
struct dentry *direntry, unsigned int flags)
{
- struct inode *inode;
+ struct inode *inode = NULL;
+ struct cached_fid *cfid;
int rc;
if (flags & LOOKUP_RCU)
@@ -812,6 +833,21 @@ cifs_d_revalidate(struct inode *dir, const struct qstr *name,
return 1;
}
+ } else {
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dir->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ if (!open_cached_dir_by_dentry(tcon, direntry->d_parent, &cfid)) {
+ /*
+ * dentry is negative and parent is fully cached:
+ * we can assume file does not exist
+ */
+ if (cfid->dirents.is_valid) {
+ close_cached_dir(cfid);
+ return 1;
+ }
+ close_cached_dir(cfid);
+ }
}
/*
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 072383899e81..e60927b2a7c8 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -773,16 +773,14 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc,
}
- len = 0;
value = strchr(key, '=');
if (value) {
if (value == key)
continue;
*value++ = 0;
- len = strlen(value);
}
- ret = vfs_parse_fs_string(fc, key, value, len);
+ ret = vfs_parse_fs_string(fc, key, value);
if (ret < 0)
break;
}
@@ -1820,6 +1818,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
+ /*
+ * Multichannel is not meaningful if max_channels is 1.
+ * Force multichannel to false to ensure consistent configuration.
+ */
+ if (ctx->multichannel && ctx->max_channels == 1)
+ ctx->multichannel = false;
+
return 0;
cifs_parse_mount_err:
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 7e9784080501..8bb544be401e 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2704,7 +2704,7 @@ cifs_dentry_needs_reval(struct dentry *dentry)
return true;
if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) {
- if (cfid->time && cifs_i->time > cfid->time) {
+ if (cifs_i->time > cfid->time) {
close_cached_dir(cfid);
return false;
}
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 4e5460206397..f0ce26622a14 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -874,39 +874,42 @@ static void finished_cached_dirents_count(struct cached_dirents *cde,
cde->is_valid = 1;
}
-static void add_cached_dirent(struct cached_dirents *cde,
- struct dir_context *ctx,
- const char *name, int namelen,
- struct cifs_fattr *fattr,
- struct file *file)
+static bool add_cached_dirent(struct cached_dirents *cde,
+ struct dir_context *ctx, const char *name,
+ int namelen, struct cifs_fattr *fattr,
+ struct file *file)
{
struct cached_dirent *de;
if (cde->file != file)
- return;
+ return false;
if (cde->is_valid || cde->is_failed)
- return;
+ return false;
if (ctx->pos != cde->pos) {
cde->is_failed = 1;
- return;
+ return false;
}
de = kzalloc(sizeof(*de), GFP_ATOMIC);
if (de == NULL) {
cde->is_failed = 1;
- return;
+ return false;
}
de->namelen = namelen;
de->name = kstrndup(name, namelen, GFP_ATOMIC);
if (de->name == NULL) {
kfree(de);
cde->is_failed = 1;
- return;
+ return false;
}
de->pos = ctx->pos;
memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr));
list_add_tail(&de->entry, &cde->entries);
+ /* update accounting */
+ cde->entries_count++;
+ cde->bytes_used += sizeof(*de) + (size_t)namelen + 1;
+ return true;
}
static bool cifs_dir_emit(struct dir_context *ctx,
@@ -915,7 +918,8 @@ static bool cifs_dir_emit(struct dir_context *ctx,
struct cached_fid *cfid,
struct file *file)
{
- bool rc;
+ size_t delta_bytes = 0;
+ bool rc, added = false;
ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype);
@@ -923,10 +927,20 @@ static bool cifs_dir_emit(struct dir_context *ctx,
return rc;
if (cfid) {
+ /* Cost of this entry */
+ delta_bytes = sizeof(struct cached_dirent) + (size_t)namelen + 1;
+
mutex_lock(&cfid->dirents.de_mutex);
- add_cached_dirent(&cfid->dirents, ctx, name, namelen,
- fattr, file);
+ added = add_cached_dirent(&cfid->dirents, ctx, name, namelen,
+ fattr, file);
mutex_unlock(&cfid->dirents.de_mutex);
+
+ if (added) {
+ /* per-tcon then global for consistency with free path */
+ atomic64_add((long long)delta_bytes, &cfid->cfids->total_dirents_bytes);
+ atomic_long_inc(&cfid->cfids->total_dirents_entries);
+ atomic64_add((long long)delta_bytes, &cifs_dircache_bytes_used);
+ }
}
return rc;
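
Each cached entry is charged sizeof(struct cached_dirent) plus the name length and its NUL terminator; the add path above and free_cached_dir() compute the same value, so the per-tcon and global counters stay balanced. For a 5-character name the charge is sizeof(struct cached_dirent) + 6 bytes. As a hypothetical helper (not part of the patch):

	/* Per-entry memory charge, mirrored on the add and free paths. */
	static inline size_t cached_dirent_cost(int namelen)
	{
		return sizeof(struct cached_dirent) + (size_t)namelen + 1;
	}
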
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 4711a23c5b38..058050f744c0 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -954,11 +954,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
if (!rc) {
- if (cfid->has_lease) {
- close_cached_dir(cfid);
- return 0;
- }
close_cached_dir(cfid);
+ return 0;
}
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
@@ -4219,7 +4216,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
int num_rqst, const u8 *sig, u8 **iv,
struct aead_request **req, struct sg_table *sgt,
- unsigned int *num_sgs, size_t *sensitive_size)
+ unsigned int *num_sgs)
{
unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
unsigned int iv_size = crypto_aead_ivsize(tfm);
@@ -4236,9 +4233,8 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst
len += req_size;
len = ALIGN(len, __alignof__(struct scatterlist));
len += array_size(*num_sgs, sizeof(struct scatterlist));
- *sensitive_size = len;
- p = kvzalloc(len, GFP_NOFS);
+ p = kzalloc(len, GFP_NOFS);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -4252,16 +4248,14 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst
static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst,
int num_rqst, const u8 *sig, u8 **iv,
- struct aead_request **req, struct scatterlist **sgl,
- size_t *sensitive_size)
+ struct aead_request **req, struct scatterlist **sgl)
{
struct sg_table sgtable = {};
unsigned int skip, num_sgs, i, j;
ssize_t rc;
void *p;
- p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable,
- &num_sgs, sensitive_size);
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, &num_sgs);
if (IS_ERR(p))
return ERR_CAST(p);
@@ -4350,7 +4344,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
DECLARE_CRYPTO_WAIT(wait);
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
- size_t sensitive_size;
rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
@@ -4376,8 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg,
- &sensitive_size);
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
if (IS_ERR(creq))
return PTR_ERR(creq);
@@ -4407,7 +4399,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kvfree_sensitive(creq, sensitive_size);
+ kfree_sensitive(creq);
return rc;
}
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 1c63d2c9cc9c..42e2d4ea344d 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -240,8 +240,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
*/
if (smb2_command != SMB2_TREE_DISCONNECT) {
spin_unlock(&tcon->tc_lock);
- cifs_dbg(FYI, "can not send cmd %d while umounting\n",
- smb2_command);
+ cifs_tcon_dbg(FYI, "can not send cmd %d while umounting\n",
+ smb2_command);
return -ENODEV;
}
}
@@ -296,9 +296,9 @@ again:
return 0;
}
spin_unlock(&ses->chan_lock);
- cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d",
- tcon->ses->chans_need_reconnect,
- tcon->need_reconnect);
+ cifs_tcon_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d\n",
+ tcon->ses->chans_need_reconnect,
+ tcon->need_reconnect);
mutex_lock(&ses->session_mutex);
/*
@@ -392,11 +392,11 @@ skip_sess_setup:
rc = cifs_tree_connect(0, tcon);
- cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
+ cifs_tcon_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc) {
/* If sess reconnected but tcon didn't, something strange ... */
mutex_unlock(&ses->session_mutex);
- cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
+ cifs_tcon_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
goto out;
}
@@ -442,8 +442,8 @@ skip_sess_setup:
from_reconnect);
goto skip_add_channels;
} else if (rc)
- cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
- __func__, rc);
+ cifs_tcon_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
if (ses->chan_max > ses->chan_count &&
ses->iface_count &&
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index a61ba7f3fb86..051cd9dbba13 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -22,6 +22,7 @@
#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/task_work.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -173,9 +174,16 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
* send a packet. In most cases if we fail to send
* after the retries we will kill the socket and
* reconnect which may clear the network problem.
+ *
+ * Even if regular signals are masked, EINTR might be
+ * propagated from sk_stream_wait_memory() to here when
+ * TIF_NOTIFY_SIGNAL is used for task work. For example,
+ * certain io_uring completions will use that. Treat EINTR
+ * with pending task work the same as EAGAIN to avoid
+ * unnecessary reconnects.
*/
rc = sock_sendmsg(ssocket, smb_msg);
- if (rc == -EAGAIN) {
+ if (rc == -EAGAIN || unlikely(rc == -EINTR && task_work_pending(current))) {
retries++;
if (retries >= 14 ||
(!server->noblocksnd && (retries > 2))) {
@@ -323,8 +331,7 @@ int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
break;
total_len += sent;
}
-
-}
+ }
unmask:
sigprocmask(SIG_SETMASK, &oldmask, NULL);
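
With regular signals masked around the send, an -EINTR from sock_sendmsg() can still surface through TIF_NOTIFY_SIGNAL, so the patch retries it exactly like -EAGAIN when task work is pending. The retry decision in isolation (names hypothetical):

	#include <linux/errno.h>
	#include <linux/sched.h>
	#include <linux/task_work.h>

	/* Is this send error transient enough to retry? (sketch) */
	static bool send_should_retry(int rc)
	{
		if (rc == -EAGAIN)
			return true;
		/* task-work-driven EINTR, e.g. io_uring completions */
		return rc == -EINTR && task_work_pending(current);
	}
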
diff --git a/fs/smb/common/Makefile b/fs/smb/common/Makefile
index c66dbbc1469c..9e0730a385fb 100644
--- a/fs/smb/common/Makefile
+++ b/fs/smb/common/Makefile
@@ -3,5 +3,4 @@
# Makefile for Linux filesystem routines that are shared by client and server.
#
-obj-$(CONFIG_SMBFS) += cifs_arc4.o
obj-$(CONFIG_SMBFS) += cifs_md4.o
diff --git a/fs/smb/common/arc4.h b/fs/smb/common/arc4.h
deleted file mode 100644
index 12e71ec033a1..000000000000
--- a/fs/smb/common/arc4.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Common values for ARC4 Cipher Algorithm
- */
-
-#ifndef _CRYPTO_ARC4_H
-#define _CRYPTO_ARC4_H
-
-#include <linux/types.h>
-
-#define ARC4_MIN_KEY_SIZE 1
-#define ARC4_MAX_KEY_SIZE 256
-#define ARC4_BLOCK_SIZE 1
-
-struct arc4_ctx {
- u32 S[256];
- u32 x, y;
-};
-
-int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len);
-void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len);
-
-#endif /* _CRYPTO_ARC4_H */
diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c
deleted file mode 100644
index df360ca47826..000000000000
--- a/fs/smb/common/cifs_arc4.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API
- *
- * ARC4 Cipher Algorithm
- *
- * Jon Oberheide <jon@oberheide.org>
- */
-
-#include <linux/module.h>
-#include "arc4.h"
-
-MODULE_DESCRIPTION("ARC4 Cipher Algorithm");
-MODULE_LICENSE("GPL");
-
-int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
-{
- int i, j = 0, k = 0;
-
- ctx->x = 1;
- ctx->y = 0;
-
- for (i = 0; i < 256; i++)
- ctx->S[i] = i;
-
- for (i = 0; i < 256; i++) {
- u32 a = ctx->S[i];
-
- j = (j + in_key[k] + a) & 0xff;
- ctx->S[i] = ctx->S[j];
- ctx->S[j] = a;
- if (++k >= key_len)
- k = 0;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(cifs_arc4_setkey);
-
-void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
-{
- u32 *const S = ctx->S;
- u32 x, y, a, b;
- u32 ty, ta, tb;
-
- if (len == 0)
- return;
-
- x = ctx->x;
- y = ctx->y;
-
- a = S[x];
- y = (y + a) & 0xff;
- b = S[y];
-
- do {
- S[y] = a;
- a = (a + b) & 0xff;
- S[x] = b;
- x = (x + 1) & 0xff;
- ta = S[x];
- ty = (y + ta) & 0xff;
- tb = S[ty];
- *out++ = *in++ ^ S[a];
- if (--len == 0)
- break;
- y = ty;
- a = ta;
- b = tb;
- } while (true);
-
- ctx->x = x;
- ctx->y = y;
-}
-EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index 4a23a5e7e8fe..098cac98d31e 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -10,6 +10,7 @@ config SMB_SERVER
select CRYPTO_MD5
select CRYPTO_HMAC
select CRYPTO_ECB
+ select CRYPTO_LIB_ARC4
select CRYPTO_LIB_DES
select CRYPTO_LIB_SHA256
select CRYPTO_SHA256
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index d99871c21451..b4020bb55a26 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -20,6 +20,7 @@
#include "glob.h"
#include <linux/fips.h>
+#include <crypto/arc4.h>
#include <crypto/des.h>
#include "server.h"
@@ -29,7 +30,6 @@
#include "mgmt/user_config.h"
#include "crypto_ctx.h"
#include "transport_ipc.h"
-#include "../common/arc4.h"
/*
* Fixed format data defining GSS header and fixed string
@@ -365,10 +365,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
if (!ctx_arc4)
return -ENOMEM;
- cifs_arc4_setkey(ctx_arc4, sess->sess_key,
- SMB2_NTLMV2_SESSKEY_SIZE);
- cifs_arc4_crypt(ctx_arc4, sess->sess_key,
- (char *)authblob + sess_key_off, sess_key_len);
+ arc4_setkey(ctx_arc4, sess->sess_key, SMB2_NTLMV2_SESSKEY_SIZE);
+ arc4_crypt(ctx_arc4, sess->sess_key,
+ (char *)authblob + sess_key_off, sess_key_len);
kfree_sensitive(ctx_arc4);
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 91a934411134..b6b4f1286b9c 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -19,7 +19,7 @@ static DEFINE_MUTEX(init_lock);
static struct ksmbd_conn_ops default_conn_ops;
-LIST_HEAD(conn_list);
+DEFINE_HASHTABLE(conn_list, CONN_HASH_BITS);
DECLARE_RWSEM(conn_list_lock);
/**
@@ -33,7 +33,7 @@ DECLARE_RWSEM(conn_list_lock);
void ksmbd_conn_free(struct ksmbd_conn *conn)
{
down_write(&conn_list_lock);
- list_del(&conn->conns_list);
+ hash_del(&conn->hlist);
up_write(&conn_list_lock);
xa_destroy(&conn->sessions);
@@ -77,7 +77,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
init_waitqueue_head(&conn->req_running_q);
init_waitqueue_head(&conn->r_count_q);
- INIT_LIST_HEAD(&conn->conns_list);
INIT_LIST_HEAD(&conn->requests);
INIT_LIST_HEAD(&conn->async_requests);
spin_lock_init(&conn->request_lock);
@@ -90,19 +89,17 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
init_rwsem(&conn->session_lock);
- down_write(&conn_list_lock);
- list_add(&conn->conns_list, &conn_list);
- up_write(&conn_list_lock);
return conn;
}
bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c)
{
struct ksmbd_conn *t;
+ int bkt;
bool ret = false;
down_read(&conn_list_lock);
- list_for_each_entry(t, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, t, hlist) {
if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE))
continue;
@@ -163,9 +160,10 @@ void ksmbd_conn_unlock(struct ksmbd_conn *conn)
void ksmbd_all_conn_set_status(u64 sess_id, u32 status)
{
struct ksmbd_conn *conn;
+ int bkt;
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
if (conn->binding || xa_load(&conn->sessions, sess_id))
WRITE_ONCE(conn->status, status);
}
@@ -181,14 +179,14 @@ int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id)
{
struct ksmbd_conn *conn;
int rc, retry_count = 0, max_timeout = 120;
- int rcount = 1;
+ int rcount = 1, bkt;
retry_idle:
if (retry_count >= max_timeout)
return -EIO;
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
if (conn->binding || xa_load(&conn->sessions, sess_id)) {
if (conn == curr_conn)
rcount = 2;
@@ -480,10 +478,11 @@ static void stop_sessions(void)
{
struct ksmbd_conn *conn;
struct ksmbd_transport *t;
+ int bkt;
again:
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
t = conn->transport;
ksmbd_conn_set_exiting(conn);
if (t->ops->shutdown) {
@@ -494,7 +493,7 @@ again:
}
up_read(&conn_list_lock);
- if (!list_empty(&conn_list)) {
+ if (!hash_empty(conn_list)) {
msleep(100);
goto again;
}
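
conn_list changes from a linked list to a static hashtable keyed by the peer-address hash, so the per-IP limit check only walks one bucket while full scans use hash_for_each(). The <linux/hashtable.h> idiom in miniature (names hypothetical):

	#include <linux/hashtable.h>

	struct item {
		u32 key;
		struct hlist_node hlist;
	};

	static DEFINE_HASHTABLE(items, 4);	/* 2^4 buckets */

	static void item_demo(struct item *it)
	{
		struct item *t;
		int bkt;

		hash_add(items, &it->hlist, it->key);	/* insert by key */
		hash_for_each(items, bkt, t, hlist)	/* every bucket */
			;
		hash_for_each_possible(items, t, hlist, it->key) /* one bucket */
			;
		hash_del(&it->hlist);
	}
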
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 07b43634262a..7f9bcd9817b5 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -54,11 +54,12 @@ struct ksmbd_conn {
u8 inet6_addr[16];
#endif
};
+ unsigned int inet_hash;
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
struct unicode_map *um;
- struct list_head conns_list;
+ struct hlist_node hlist;
struct rw_semaphore session_lock;
/* smb session 1 per user */
struct xarray sessions;
@@ -153,7 +154,8 @@ struct ksmbd_transport {
#define KSMBD_TCP_SEND_TIMEOUT (5 * HZ)
#define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr))
-extern struct list_head conn_list;
+#define CONN_HASH_BITS 12
+extern DECLARE_HASHTABLE(conn_list, CONN_HASH_BITS);
extern struct rw_semaphore conn_list_lock;
bool ksmbd_conn_alive(struct ksmbd_conn *conn);
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 3f07a612c05b..8ccd57fd904b 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -112,10 +112,11 @@ struct ksmbd_startup_request {
__u32 smbd_max_io_size; /* smbd read write size */
__u32 max_connections; /* Number of maximum simultaneous connections */
__s8 bind_interfaces_only;
- __s8 reserved[503]; /* Reserved room */
+ __u32 max_ip_connections; /* Maximum number of connections per IP address */
+ __s8 reserved[499]; /* Reserved room */
__u32 ifc_list_sz; /* interfaces list size */
__s8 ____payload[];
-};
+} __packed;
#define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload)
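
The new __u32 is carved out of the reserved area (503 -> 499 bytes), so the fixed part of the netlink request keeps its size, and the added __packed pins the layout against compiler padding. A build-time guard for that kind of ABI invariant could look like the sketch below; the 568 is a placeholder, not the real ksmbd header size:

	#include <linux/build_bug.h>
	#include <linux/stddef.h>

	/* Hypothetical fixed size; assert reserved[] shrank by exactly
	 * the bytes the new fields added.
	 */
	#define STARTUP_REQ_FIXED_SIZE 568
	static_assert(offsetof(struct ksmbd_startup_request, ____payload) ==
		      STARTUP_REQ_FIXED_SIZE,
		      "startup request ABI size changed");
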
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index d3d5f99bdd34..c9b1108d6e96 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -19,7 +19,7 @@
#include "../transport_ipc.h"
#include "../misc.h"
-#define SHARE_HASH_BITS 3
+#define SHARE_HASH_BITS 12
static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS);
static DECLARE_RWSEM(shares_table_lock);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 9dec4c2940bc..6fa025374f2f 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -18,7 +18,7 @@
static DEFINE_IDA(session_ida);
-#define SESSION_HASH_BITS 3
+#define SESSION_HASH_BITS 12
static DEFINE_HASHTABLE(sessions_table, SESSION_HASH_BITS);
static DECLARE_RWSEM(sessions_table_lock);
@@ -104,29 +104,32 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
if (!entry)
return -ENOMEM;
- down_read(&sess->rpc_lock);
entry->method = method;
entry->id = id = ksmbd_ipc_id_alloc();
if (id < 0)
goto free_entry;
+
+ down_write(&sess->rpc_lock);
old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP);
- if (xa_is_err(old))
+ if (xa_is_err(old)) {
+ up_write(&sess->rpc_lock);
goto free_id;
+ }
resp = ksmbd_rpc_open(sess, id);
- if (!resp)
- goto erase_xa;
+ if (!resp) {
+ xa_erase(&sess->rpc_handle_list, entry->id);
+ up_write(&sess->rpc_lock);
+ goto free_id;
+ }
- up_read(&sess->rpc_lock);
+ up_write(&sess->rpc_lock);
kvfree(resp);
return id;
-erase_xa:
- xa_erase(&sess->rpc_handle_list, entry->id);
free_id:
ksmbd_rpc_id_free(entry->id);
free_entry:
kfree(entry);
- up_read(&sess->rpc_lock);
return -EINVAL;
}
@@ -144,9 +147,14 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)
{
struct ksmbd_session_rpc *entry;
+ int method;
+ down_read(&sess->rpc_lock);
entry = xa_load(&sess->rpc_handle_list, id);
- return entry ? entry->method : 0;
+ method = entry ? entry->method : 0;
+ up_read(&sess->rpc_lock);
+
+ return method;
}
void ksmbd_session_destroy(struct ksmbd_session *sess)
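
ksmbd_session_rpc_open() used to take sess->rpc_lock for read around xa_store(), which mutates the xarray; the fix takes the lock for write there and adds a read-side hold in ksmbd_session_rpc_method(). The rwsem discipline in a nutshell (names hypothetical):

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(demo_lock);

	static int demo_lookup(void)
	{
		down_read(&demo_lock);	/* shared: readers may run together */
		/* xa_load() and other read-only accesses go here */
		up_read(&demo_lock);
		return 0;
	}

	static void demo_update(void)
	{
		down_write(&demo_lock);	/* exclusive: xa_store() and friends */
		/* ... */
		up_write(&demo_lock);
	}
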
diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h
index 995555febe7d..b8a7317be86b 100644
--- a/fs/smb/server/server.h
+++ b/fs/smb/server/server.h
@@ -43,6 +43,7 @@ struct ksmbd_server_config {
unsigned int auth_mechs;
unsigned int max_connections;
unsigned int max_inflight_req;
+ unsigned int max_ip_connections;
char *conf[SERVER_CONF_WORK_GROUP + 1];
struct task_struct *dh_task;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0c069eff80b7..ab1d45fcebde 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -5629,7 +5629,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
if (!work->tcon->posix_extensions) {
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
- rc = -EOPNOTSUPP;
+ path_put(&path);
+ return -EOPNOTSUPP;
} else {
info = (struct filesystem_posix_info *)(rsp->Buffer);
info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize);
@@ -7361,7 +7362,7 @@ int smb2_lock(struct ksmbd_work *work)
int nolock = 0;
LIST_HEAD(lock_list);
LIST_HEAD(rollback_list);
- int prior_lock = 0;
+ int prior_lock = 0, bkt;
WORK_BUFFERS(work, req, rsp);
@@ -7471,7 +7472,7 @@ int smb2_lock(struct ksmbd_work *work)
nolock = 1;
/* check locks in connection list */
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
spin_lock(&conn->llist_lock);
list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) {
if (file_inode(cmp_lock->fl->c.flc_file) !=
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 2a3e2b0ce557..2aa1b29bea08 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -335,6 +335,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
if (req->max_connections)
server_conf.max_connections = req->max_connections;
+ if (req->max_ip_connections)
+ server_conf.max_ip_connections = req->max_ip_connections;
+
ret = ksmbd_set_netbios_name(req->netbios_name);
ret |= ksmbd_set_server_string(req->server_string);
ret |= ksmbd_set_work_group(req->work_group);
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 9e644a0daf1c..b3077766d6ec 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -425,6 +425,11 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
conn = ksmbd_conn_alloc();
if (!conn)
goto err;
+
+ down_write(&conn_list_lock);
+ hash_add(conn_list, &conn->hlist, 0);
+ up_write(&conn_list_lock);
+
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops;
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 4337df97987d..7a1e3dcc2cde 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -86,13 +86,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
}
#if IS_ENABLED(CONFIG_IPV6)
- if (client_sk->sk->sk_family == AF_INET6)
+ if (client_sk->sk->sk_family == AF_INET6) {
memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16);
- else
+ conn->inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr);
+ } else {
conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+ conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
+ }
#else
conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+ conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
#endif
+ down_write(&conn_list_lock);
+ hash_add(conn_list, &conn->hlist, conn->inet_hash);
+ up_write(&conn_list_lock);
+
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
@@ -170,17 +178,6 @@ static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs
return new_iov;
}
-static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa)
-{
- switch (sa->sa_family) {
- case AF_INET:
- return ntohs(((struct sockaddr_in *)sa)->sin_port);
- case AF_INET6:
- return ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
- }
- return 0;
-}
-
/**
* ksmbd_tcp_new_connection() - create a new tcp session on mount
* @client_sk: socket associated with new connection
@@ -192,7 +189,6 @@ static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa)
*/
static int ksmbd_tcp_new_connection(struct socket *client_sk)
{
- struct sockaddr *csin;
int rc = 0;
struct tcp_transport *t;
struct task_struct *handler;
@@ -203,27 +199,26 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
return -ENOMEM;
}
- csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn);
- if (kernel_getpeername(client_sk, csin) < 0) {
- pr_err("client ip resolution failed\n");
- rc = -EINVAL;
- goto out_error;
- }
-
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:%pI6c",
+ &KSMBD_TRANS(t)->conn->inet6_addr);
+ else
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:%pI4",
+ &KSMBD_TRANS(t)->conn->inet_addr);
+#else
handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn,
- "ksmbd:%u",
- ksmbd_tcp_get_port(csin));
+ KSMBD_TRANS(t)->conn, "ksmbd:%pI4",
+ &KSMBD_TRANS(t)->conn->inet_addr);
+#endif
if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
rc = PTR_ERR(handler);
free_transport(t);
}
return rc;
-
-out_error:
- free_transport(t);
- return rc;
}
/**
@@ -237,7 +232,8 @@ static int ksmbd_kthread_fn(void *p)
struct socket *client_sk = NULL;
struct interface *iface = (struct interface *)p;
struct ksmbd_conn *conn;
- int ret;
+ int ret, inet_hash;
+ unsigned int max_ip_conns;
while (!kthread_should_stop()) {
mutex_lock(&iface->sock_release_lock);
@@ -255,34 +251,49 @@ static int ksmbd_kthread_fn(void *p)
continue;
}
+ if (!server_conf.max_ip_connections)
+ goto skip_max_ip_conns_limit;
+
/*
* Limits repeated connections from clients with the same IP.
*/
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr);
+ else
+ inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
+#else
+ inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
+#endif
+
+ max_ip_conns = 0;
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list)
+ hash_for_each_possible(conn_list, conn, hlist, inet_hash) {
#if IS_ENABLED(CONFIG_IPV6)
if (client_sk->sk->sk_family == AF_INET6) {
if (memcmp(&client_sk->sk->sk_v6_daddr,
- &conn->inet6_addr, 16) == 0) {
- ret = -EAGAIN;
- break;
- }
+ &conn->inet6_addr, 16) == 0)
+ max_ip_conns++;
} else if (inet_sk(client_sk->sk)->inet_daddr ==
- conn->inet_addr) {
- ret = -EAGAIN;
- break;
- }
+ conn->inet_addr)
+ max_ip_conns++;
#else
if (inet_sk(client_sk->sk)->inet_daddr ==
- conn->inet_addr) {
+ conn->inet_addr)
+ max_ip_conns++;
+#endif
+ if (server_conf.max_ip_connections <= max_ip_conns) {
+ pr_info_ratelimited("Maximum IP connections exceeded (%u/%u)\n",
+ max_ip_conns, server_conf.max_ip_connections);
ret = -EAGAIN;
break;
}
-#endif
+ }
up_read(&conn_list_lock);
if (ret == -EAGAIN)
continue;
+skip_max_ip_conns_limit:
if (server_conf.max_connections &&
atomic_inc_return(&active_num_conn) >= server_conf.max_connections) {
pr_info_ratelimited("Limit the maximum number of connections(%u)\n",
@@ -468,12 +479,13 @@ static int create_socket(struct interface *iface)
struct socket *ksmbd_socket;
bool ipv4 = false;
- ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
+ ret = sock_create_kern(current->nsproxy->net_ns, PF_INET6, SOCK_STREAM,
+ IPPROTO_TCP, &ksmbd_socket);
if (ret) {
if (ret != -EAFNOSUPPORT)
pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret);
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP,
- &ksmbd_socket);
+ ret = sock_create_kern(current->nsproxy->net_ns, PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
if (ret) {
pr_err("Can't create socket for ipv4: %d\n", ret);
goto out_clear;
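
sock_create_kern() creates a kernel socket in an explicitly chosen network namespace; here the server passes the namespace of the task that started it. A minimal sketch of the call:

	#include <linux/in.h>
	#include <linux/net.h>
	#include <linux/nsproxy.h>

	/* Kernel TCP socket in the caller's network namespace (sketch). */
	static int make_listener(struct socket **sock)
	{
		return sock_create_kern(current->nsproxy->net_ns, PF_INET,
					SOCK_STREAM, IPPROTO_TCP, sock);
	}
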
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 1cfa688904b2..c96e5d934ba9 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -20,6 +20,7 @@
#include <linux/sched/xacct.h>
#include <linux/crc32c.h>
#include <linux/namei.h>
+#include <linux/splice.h>
#include "glob.h"
#include "oplock.h"
@@ -1829,8 +1830,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
if (src_off + len > src_file_size)
return -E2BIG;
- ret = vfs_copy_file_range(src_fp->filp, src_off,
- dst_fp->filp, dst_off, len, 0);
+ /*
+ * vfs_copy_file_range does not allow overlapped copying
+ * within the same file.
+ */
+ if (file_inode(src_fp->filp) == file_inode(dst_fp->filp) &&
+ dst_off + len > src_off &&
+ dst_off < src_off + len)
+ ret = do_splice_direct(src_fp->filp, &src_off,
+ dst_fp->filp, &dst_off,
+ min_t(size_t, len, MAX_RW_COUNT), 0);
+ else
+ ret = vfs_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off, len, 0);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = vfs_copy_file_range(src_fp->filp, src_off,
dst_fp->filp, dst_off, len,
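
Two half-open ranges [src_off, src_off + len) and [dst_off, dst_off + len) overlap exactly when each begins before the other ends, which is the condition used above to route same-inode overlapping copies through do_splice_direct(). As a standalone predicate (sketch):

	#include <linux/types.h>

	/* Half-open interval overlap test (sketch). */
	static inline bool ranges_overlap(loff_t a_off, loff_t b_off, size_t len)
	{
		return b_off < a_off + (loff_t)len &&
		       a_off < b_off + (loff_t)len;
	}
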
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index ce7d661d5ad8..1582e0637a7e 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -307,7 +307,8 @@ static int fill_meta_index(struct inode *inode, int index,
all_done:
*index_block = cur_index_block;
*index_offset = cur_offset;
- *data_block = cur_data_block;
+ if (data_block)
+ *data_block = cur_data_block;
/*
* Scale cache index (cache slot entry) to index
@@ -324,17 +325,15 @@ failed:
* Get the on-disk location and compressed size of the datablock
* specified by index. Fill_meta_index() does most of the work.
*/
-static int read_blocklist(struct inode *inode, int index, u64 *block)
+static int read_blocklist_ptrs(struct inode *inode, int index, u64 *start,
+ int *offset, u64 *block)
{
- u64 start;
long long blks;
- int offset;
__le32 size;
- int res = fill_meta_index(inode, index, &start, &offset, block);
+ int res = fill_meta_index(inode, index, start, offset, block);
- TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
- " 0x%x, block 0x%llx\n", res, index, start, offset,
- *block);
+ TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset 0x%x, block 0x%llx\n",
+ res, index, *start, *offset, block ? *block : 0);
if (res < 0)
return res;
@@ -346,22 +345,31 @@ static int read_blocklist(struct inode *inode, int index, u64 *block)
* extra block indexes needed.
*/
if (res < index) {
- blks = read_indexes(inode->i_sb, index - res, &start, &offset);
+ blks = read_indexes(inode->i_sb, index - res, start, offset);
if (blks < 0)
return (int) blks;
- *block += blks;
+ if (block)
+ *block += blks;
}
/*
* Read length of block specified by index.
*/
- res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
+ res = squashfs_read_metadata(inode->i_sb, &size, start, offset,
sizeof(size));
if (res < 0)
return res;
return squashfs_block_size(size);
}
+static inline int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+ u64 start;
+ int offset;
+
+ return read_blocklist_ptrs(inode, index, &start, &offset, block);
+}
+
static bool squashfs_fill_page(struct folio *folio,
struct squashfs_cache_entry *buffer, size_t offset,
size_t avail)
@@ -658,7 +666,114 @@ skip_pages:
kfree(pages);
}
+static loff_t seek_hole_data(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 start, index = offset >> msblk->block_log;
+ u64 file_end = (i_size_read(inode) + msblk->block_size - 1) >> msblk->block_log;
+ int s_offset, length;
+ __le32 *blist = NULL;
+
+ /* reject offset if negative or beyond file end */
+ if ((unsigned long long)offset >= i_size_read(inode))
+ return -ENXIO;
+
+ /* is offset within tailend and is tailend packed into a fragment? */
+ if (index + 1 == file_end &&
+ squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
+ if (whence == SEEK_DATA)
+ return offset;
+
+ /* there is an implicit hole at the end of any file */
+ return i_size_read(inode);
+ }
+
+ length = read_blocklist_ptrs(inode, index, &start, &s_offset, NULL);
+ if (length < 0)
+ return length;
+
+ /* nothing more to do if offset matches desired whence value */
+ if ((length == 0 && whence == SEEK_HOLE) ||
+ (length && whence == SEEK_DATA))
+ return offset;
+
+ /* skip scanning forwards if we're at file end */
+ if (++index == file_end)
+ goto not_found;
+
+ blist = kmalloc(SQUASHFS_SCAN_INDEXES << 2, GFP_KERNEL);
+ if (blist == NULL) {
+ ERROR("%s: Failed to allocate block_list\n", __func__);
+ return -ENOMEM;
+ }
+
+ while (index < file_end) {
+ int i, indexes = min(file_end - index, SQUASHFS_SCAN_INDEXES);
+
+ offset = squashfs_read_metadata(sb, blist, &start, &s_offset, indexes << 2);
+ if (offset < 0)
+ goto finished;
+
+ for (i = 0; i < indexes; i++) {
+ length = squashfs_block_size(blist[i]);
+ if (length < 0) {
+ offset = length;
+ goto finished;
+ }
+
+ /* does this block match desired whence value? */
+ if ((length == 0 && whence == SEEK_HOLE) ||
+ (length && whence == SEEK_DATA)) {
+ offset = (index + i) << msblk->block_log;
+ goto finished;
+ }
+ }
+
+ index += indexes;
+ }
+
+not_found:
+ /* whence value determines what happens */
+ if (whence == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ /* there is an implicit hole at the end of any file */
+ offset = i_size_read(inode);
+
+finished:
+ kfree(blist);
+ return offset;
+}
+
+static loff_t squashfs_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ switch (whence) {
+ default:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ offset = seek_hole_data(file, offset, whence);
+ break;
+ }
+
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct address_space_operations squashfs_aops = {
.read_folio = squashfs_read_folio,
.readahead = squashfs_readahead
};
+
+const struct file_operations squashfs_file_operations = {
+ .llseek = squashfs_llseek,
+ .read_iter = generic_file_read_iter,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
+ .splice_read = filemap_splice_read
+};
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index d5918eba27e3..cceae3b78698 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -68,6 +68,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
+ /* The file type must not be set yet; the caller sets it later. */
+ if (inode->i_mode & S_IFMT)
+ err = -EIO;
+
return err;
}
@@ -140,8 +144,17 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
+ inode->i_size = le32_to_cpu(sqsh_ino->file_size);
frag = le32_to_cpu(sqsh_ino->fragment);
if (frag != SQUASHFS_INVALID_FRAG) {
+ /*
+ * a file cannot both end in a fragment (tailend) and have a
+ * file size that is a multiple of the block size
+ */
+ if ((inode->i_size & (msblk->block_size - 1)) == 0) {
+ err = -EINVAL;
+ goto failed_read;
+ }
frag_offset = le32_to_cpu(sqsh_ino->offset);
frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
if (frag_size < 0) {
@@ -155,8 +168,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
}
set_nlink(inode, 1);
- inode->i_size = le32_to_cpu(sqsh_ino->file_size);
- inode->i_fop = &generic_ro_fops;
+ inode->i_fop = &squashfs_file_operations;
inode->i_mode |= S_IFREG;
inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
squashfs_i(inode)->fragment_block = frag_blk;
@@ -165,6 +177,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
squashfs_i(inode)->block_list_start = block;
squashfs_i(inode)->offset = offset;
+ squashfs_i(inode)->parent = 0;
inode->i_data.a_ops = &squashfs_aops;
TRACE("File inode %x:%x, start_block %llx, block_list_start "
@@ -183,8 +196,21 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
+ inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+ if (inode->i_size < 0) {
+ err = -EINVAL;
+ goto failed_read;
+ }
frag = le32_to_cpu(sqsh_ino->fragment);
if (frag != SQUASHFS_INVALID_FRAG) {
+ /*
+ * a file cannot both end in a fragment (tailend) and have a
+ * file size that is a multiple of the block size
+ */
+ if ((inode->i_size & (msblk->block_size - 1)) == 0) {
+ err = -EINVAL;
+ goto failed_read;
+ }
frag_offset = le32_to_cpu(sqsh_ino->offset);
frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
if (frag_size < 0) {
@@ -199,9 +225,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
xattr_id = le32_to_cpu(sqsh_ino->xattr);
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
- inode->i_size = le64_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_inode_ops;
- inode->i_fop = &generic_ro_fops;
+ inode->i_fop = &squashfs_file_operations;
inode->i_mode |= S_IFREG;
inode->i_blocks = (inode->i_size -
le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
@@ -212,6 +237,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
squashfs_i(inode)->block_list_start = block;
squashfs_i(inode)->offset = offset;
+ squashfs_i(inode)->parent = 0;
inode->i_data.a_ops = &squashfs_aops;
TRACE("File inode %x:%x, start_block %llx, block_list_start "
@@ -292,6 +318,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
squashfs_i(inode)->offset = offset;
+ squashfs_i(inode)->parent = 0;
if (type == SQUASHFS_LSYMLINK_TYPE) {
__le32 xattr;
@@ -329,6 +356,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
rdev = le32_to_cpu(sqsh_ino->rdev);
init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+ squashfs_i(inode)->parent = 0;
TRACE("Device inode %x:%x, rdev %x\n",
SQUASHFS_INODE_BLK(ino), offset, rdev);
@@ -353,6 +381,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
rdev = le32_to_cpu(sqsh_ino->rdev);
init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+ squashfs_i(inode)->parent = 0;
TRACE("Device inode %x:%x, rdev %x\n",
SQUASHFS_INODE_BLK(ino), offset, rdev);
@@ -373,6 +402,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFSOCK;
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
init_special_inode(inode, inode->i_mode, 0);
+ squashfs_i(inode)->parent = 0;
break;
}
case SQUASHFS_LFIFO_TYPE:
@@ -392,6 +422,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_op = &squashfs_inode_ops;
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
init_special_inode(inode, inode->i_mode, 0);
+ squashfs_i(inode)->parent = 0;
break;
}
default:
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 218868b20f16..4851bd964502 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -107,6 +107,7 @@ extern const struct address_space_operations squashfs_aops;
/* inode.c */
extern const struct inode_operations squashfs_inode_ops;
+extern const struct file_operations squashfs_file_operations;
/* namei.c */
extern const struct inode_operations squashfs_dir_inode_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 95f8e8901768..a955d9369749 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -208,6 +208,7 @@ static inline int squashfs_block_size(__le32 raw)
#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
#define SQUASHFS_META_ENTRIES 127
#define SQUASHFS_META_SLOTS 8
+#define SQUASHFS_SCAN_INDEXES 1024
struct meta_entry {
u64 data_block;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 2c82d6f2a456..8e497ac07b9a 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -16,6 +16,7 @@ struct squashfs_inode_info {
u64 xattr;
unsigned int xattr_size;
int xattr_count;
+ int parent;
union {
struct {
u64 fragment_block;
@@ -27,7 +28,6 @@ struct squashfs_inode_info {
u64 dir_idx_start;
int dir_idx_offset;
int dir_idx_cnt;
- int parent;
};
};
struct inode vfs_inode;
diff --git a/fs/super.c b/fs/super.c
index f4fa0e93c463..5bab94fb7e03 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
if (!s)
return NULL;
- INIT_LIST_HEAD(&s->s_mounts);
s->s_user_ns = get_user_ns(user_ns);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -408,7 +407,7 @@ static void __put_super(struct super_block *s)
list_del_init(&s->s_list);
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
- WARN_ON(!list_empty(&s->s_mounts));
+ WARN_ON(s->s_mounts);
call_rcu(&s->rcu, destroy_super_rcu);
}
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f24aa98e6869..a79d73f28aa7 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -2272,6 +2272,9 @@ int udf_current_aext(struct inode *inode, struct extent_position *epos,
if (check_add_overflow(sizeof(struct allocExtDesc),
le32_to_cpu(header->lengthAllocDescs), &alen))
return -1;
+
+ if (alen > epos->bh->b_size)
+ return -1;
}
switch (iinfo->i_alloc_type) {
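
check_add_overflow() already rejects a lengthAllocDescs that wraps the addition; the new test additionally requires the computed alen to fit within the buffer that was actually read. Combined into one predicate (sketch, names hypothetical):

	#include <linux/overflow.h>

	/* Validate an on-disk length against overflow and buffer size. */
	static bool alen_valid(size_t hdr, u32 len, size_t bufsize)
	{
		size_t alen;

		if (check_add_overflow(hdr, (size_t)len, &alen))
			return false;	/* arithmetic wrapped */
		return alen <= bufsize;	/* must fit the read block */
	}
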
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 770e29ec3557..42bedc4ec7af 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -315,46 +315,39 @@ static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
{
struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
struct vboxsf_handle *sf_handle;
- struct dentry *res = NULL;
u64 handle;
int err;
if (d_in_lookup(dentry)) {
- res = vboxsf_dir_lookup(parent, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = vboxsf_dir_lookup(parent, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
err = vboxsf_dir_create(parent, dentry, mode, false, flags & O_EXCL, &handle);
if (err)
- goto out;
+ return err;
sf_handle = vboxsf_create_sf_handle(d_inode(dentry), handle, SHFL_CF_ACCESS_READWRITE);
if (IS_ERR(sf_handle)) {
vboxsf_close(sbi->root, handle);
- err = PTR_ERR(sf_handle);
- goto out;
+ return PTR_ERR(sf_handle);
}
err = finish_open(file, dentry, generic_file_open);
if (err) {
/* This also closes the handle passed to vboxsf_create_sf_handle() */
vboxsf_release_sf_handle(d_inode(dentry), sf_handle);
- goto out;
+ return err;
}
file->private_data = sf_handle;
file->f_mode |= FMODE_CREATED;
-out:
- dput(res);
- return err;
+ return 0;
}
static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e721148c95d0..3e64f14739dd 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb,
percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
}
-static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
- wb_stat_mod(wb, item, 1);
-}
-
-static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
- wb_stat_mod(wb, item, -1);
-}
-
static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
return percpu_counter_read_positive(&wb->stat[item]);
@@ -118,12 +108,10 @@ int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit
*
* BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages
* should contribute to accounting
- * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages
* BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold
*/
#define BDI_CAP_WRITEBACK (1 << 0)
-#define BDI_CAP_WRITEBACK_ACCT (1 << 1)
-#define BDI_CAP_STRICTLIMIT (1 << 2)
+#define BDI_CAP_STRICTLIMIT (1 << 1)
extern struct backing_dev_info noop_backing_dev_info;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index cc3e1c1a3454..c83e02b94389 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -95,7 +95,10 @@ struct dentry {
seqcount_spinlock_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; /* lookup hash list */
struct dentry *d_parent; /* parent directory */
- struct qstr d_name;
+ union {
+ struct qstr __d_name; /* for use ONLY in fs/dcache.c */
+ const struct qstr d_name;
+ };
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
union shortname_store d_shortname;
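The union added above is a read-mostly aliasing idiom: fs/dcache.c writes the name through the private __d_name member while every other user sees the same storage through the const-qualified d_name, so stray writers fail to compile. A minimal sketch of the same technique — the struct and field names below are hypothetical, not from dcache:

struct guarded {
	union {
		int __value;		/* writable only by the owning code */
		const int value;	/* read-only alias for everyone else */
	};
};

static void owner_update(struct guarded *g)
{
	g->__value++;		/* OK: private writable member */
}

/* Elsewhere, g->value++ fails to compile: assignment to const. */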
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 2f8b8bfc0e73..6afb4a13b81d 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -79,6 +79,7 @@ enum stop_cp_reason {
STOP_CP_REASON_FLUSH_FAIL,
STOP_CP_REASON_NO_SEGMENT,
STOP_CP_REASON_CORRUPTED_FREE_BITMAP,
+ STOP_CP_REASON_CORRUPTED_NID,
STOP_CP_REASON_MAX,
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75fb216b0f7a..1f4a1c570a2a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1434,6 +1434,8 @@ struct sb_writers {
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
+struct mount;
+
struct super_block {
struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */
@@ -1468,7 +1470,7 @@ struct super_block {
__u16 s_encoding_flags;
#endif
struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
- struct list_head s_mounts; /* list of mounts; _not_ for fs use */
+ struct mount *s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
struct file *s_bdev_file;
struct backing_dev_info *s_bdi;
@@ -3714,7 +3716,8 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
* happens when a directory is casefolded and the filesystem is strict
* about its encoding.
*/
-static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+static inline bool generic_ci_validate_strict_name(struct inode *dir,
+ const struct qstr *name)
{
if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
return true;
@@ -3729,18 +3732,42 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qst
return !utf8_validate(dir->i_sb->s_encoding, name);
}
#else
-static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+static inline bool generic_ci_validate_strict_name(struct inode *dir,
+ const struct qstr *name)
{
return true;
}
#endif
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+#if IS_ENABLED(CONFIG_UNICODE)
+ return sb->s_encoding;
+#else
+ return NULL;
+#endif
+}
+
static inline bool sb_has_encoding(const struct super_block *sb)
{
+ return !!sb_encoding(sb);
+}
+
+/*
+ * Compare if two super blocks have the same encoding and flags
+ */
+static inline bool sb_same_encoding(const struct super_block *sb1,
+ const struct super_block *sb2)
+{
#if IS_ENABLED(CONFIG_UNICODE)
- return !!sb->s_encoding;
+ if (sb1->s_encoding == sb2->s_encoding)
+ return true;
+
+ return (sb1->s_encoding && sb2->s_encoding &&
+ (sb1->s_encoding->version == sb2->s_encoding->version) &&
+ (sb1->s_encoding_flags == sb2->s_encoding_flags));
#else
- return false;
+ return true;
#endif
}
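A hedged example of the kind of check the new helper enables: refusing a cross-superblock operation unless both volumes use the same casefolding table and flags. The function below is illustrative only, not part of this patch:

static int check_cross_sb_encoding(const struct super_block *src,
				   const struct super_block *dst)
{
	if (!sb_same_encoding(src, dst))
		return -EXDEV;	/* different encodings: keep them apart */
	return 0;
}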
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 671f031be173..0d6c8a6d7be2 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -134,8 +134,13 @@ extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_ty
extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
-extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
- const char *value, size_t v_size);
+extern int vfs_parse_fs_qstr(struct fs_context *fc, const char *key,
+ const struct qstr *value);
+static inline int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+ const char *value)
+{
+ return vfs_parse_fs_qstr(fc, key, value ? &QSTR(value) : NULL);
+}
int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
char *(*sep)(char **));
extern int generic_parse_monolithic(struct fs_context *fc, void *data);
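Callers of the old vfs_parse_fs_string() simply drop the length argument; code holding a length-delimited value can call vfs_parse_fs_qstr() directly. A sketch of both styles, assuming a QSTR_LEN() helper that builds a qstr from a pointer/length pair:

/* NUL-terminated value: the inline wrapper builds the qstr. */
err = vfs_parse_fs_string(fc, "source", "/dev/vda1");

/* Length-delimited value: hand a qstr straight to the new helper. */
err = vfs_parse_fs_qstr(fc, "source", &QSTR_LEN(buf, buflen));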
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index d4034ddaf392..0d954ea7b179 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -273,6 +273,8 @@ struct fsnotify_group {
int f_flags; /* event_f_flags from fanotify_init() */
struct ucounts *ucounts;
mempool_t error_events_pool;
+ /* chained on perm_group_list */
+ struct list_head perm_grp_list;
} fanotify_data;
#endif /* CONFIG_FANOTIFY */
};
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 2267902d29a7..789e23e67444 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida)
xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}
-/*
- * ida_simple_get() and ida_simple_remove() are deprecated. Use
- * ida_alloc() and ida_free() instead respectively.
- */
-#define ida_simple_get(ida, start, end, gfp) \
- ida_alloc_range(ida, start, (end) - 1, gfp)
-#define ida_simple_remove(ida, id) ida_free(ida, id)
-
static inline bool ida_is_empty(const struct ida *ida)
{
return xa_empty(&ida->xa);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 989315dabb86..5b46924fdff5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -164,11 +164,23 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/*
- * Values used for system_state. Ordering of the states must not be changed
+/**
+ * enum system_states - Values used for system_state.
+ *
+ * @SYSTEM_BOOTING: %0, no init needed
+ * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU
+ * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running
+ * @SYSTEM_RUNNING: system is up and running
+ * @SYSTEM_HALT: system entered clean system halt state
+ * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state
+ * @SYSTEM_RESTART: system entered emergency power off or normal restart
+ * @SYSTEM_SUSPEND: system entered suspend or hibernate state
+ *
+ * Note:
+ * Ordering of the states must not be changed
* as code checks for <, <=, >, >= STATE.
*/
-extern enum system_states {
+enum system_states {
SYSTEM_BOOTING,
SYSTEM_SCHEDULING,
SYSTEM_FREEING_INITMEM,
@@ -177,7 +189,8 @@ extern enum system_states {
SYSTEM_POWER_OFF,
SYSTEM_RESTART,
SYSTEM_SUSPEND,
-} system_state;
+};
+extern enum system_states system_state;
/*
* General tracing related utility functions - trace_printk(),
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 39fe3e6cd282..ff7e231b0485 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -395,6 +395,9 @@ struct kimage {
/* Information for loading purgatory */
struct purgatory_info purgatory_info;
+
+ /* Force carrying over the DTB from the current boot */
+ bool force_dtb;
#endif
#ifdef CONFIG_CRASH_HOTPLUG
@@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type);
/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \
- KEXEC_FILE_NO_CMA)
+ KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB)
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..559d13a3bc44 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -40,6 +40,7 @@ struct kho_serialization;
#ifdef CONFIG_KEXEC_HANDOVER
bool kho_is_enabled(void);
+bool is_kho_boot(void);
int kho_preserve_folio(struct folio *folio);
int kho_preserve_phys(phys_addr_t phys, size_t size);
@@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void)
return false;
}
+static inline bool is_kho_boot(void)
+{
+ return false;
+}
+
static inline int kho_preserve_folio(struct folio *folio)
{
return -EOPNOTSUPP;
diff --git a/include/linux/list.h b/include/linux/list.h
index 7f7657e41620..5bfda2f91fca 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -20,8 +20,16 @@
* using the generic single-entry routines.
*/
+/**
+ * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself
+ * @name: name of the list_head
+ */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
+/**
+ * LIST_HEAD - definition of a &struct list_head with initialization values
+ * @name: name of the list_head
+ */
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index adbe234a6f6c..8c42b4bde09c 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -85,7 +85,7 @@ LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry,
int mode, const struct qstr *name, const char **xattr_name,
struct lsm_context *cp)
LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode,
- struct qstr *name, const struct cred *old, struct cred *new)
+ const struct qstr *name, const struct cred *old, struct cred *new)
#ifdef CONFIG_SECURITY_PATH
LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry)
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 3a25122d83e2..6907aedc4f74 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod)
__module_param_call("", name, &param_ops_##type, &var, perm, \
-1, KERNEL_PARAM_FL_UNSAFE)
+/**
+ * __core_param_cb - similar to core_param(), with set/get ops instead of a type
+ * @name: the name of the cmdline and sysfs parameter (often the same as the variable)
+ * @ops: the set & get operations for this parameter.
+ * @arg: the argument passed to @ops (typically a pointer to the variable)
+ * @perm: visibility in sysfs
+ *
+ * Ideally this would be called 'core_param_cb', but that name is already
+ * used for module core parameters, so add the '__' prefix.
+ */
+#define __core_param_cb(name, ops, arg, perm) \
+ __module_param_call("", name, ops, arg, perm, -1, 0)
+
#endif /* !MODULE */
/**
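A hedged usage sketch for the new macro: a built-in ("core") parameter backed by explicit ops. my_knob and my_knob_ops are hypothetical; param_set_int()/param_get_int() are the stock integer handlers:

static int my_knob;

static const struct kernel_param_ops my_knob_ops = {
	.set = param_set_int,	/* swap in custom validation if needed */
	.get = param_get_int,
};

/* Handles "my_knob=..." on the kernel command line; 0444 in sysfs. */
__core_param_cb(my_knob, &my_knob_ops, &my_knob, 0444);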
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5f9c053b0897..acfe7ef86a1b 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -33,7 +33,6 @@ enum mount_flags {
MNT_NOSYMFOLLOW = 0x80,
MNT_SHRINKABLE = 0x100,
- MNT_WRITE_HOLD = 0x200,
MNT_INTERNAL = 0x4000,
@@ -52,7 +51,7 @@ enum mount_flags {
| MNT_READONLY | MNT_NOSYMFOLLOW,
MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
- MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
+ MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED |
MNT_SYNC_UMOUNT | MNT_LOCKED
};
@@ -77,7 +76,7 @@ extern void mntput(struct vfsmount *mnt);
extern struct vfsmount *mntget(struct vfsmount *mnt);
extern void mnt_make_shortterm(struct vfsmount *mnt);
extern struct vfsmount *mnt_clone_internal(const struct path *path);
-extern bool __mnt_is_readonly(struct vfsmount *mnt);
+extern bool __mnt_is_readonly(const struct vfsmount *mnt);
extern bool mnt_may_suid(struct vfsmount *mnt);
extern struct vfsmount *clone_private_mount(const struct path *path);
@@ -104,8 +103,8 @@ extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
int do_mount(const char *, const char __user *,
const char *, unsigned long, void *);
-extern struct path *collect_paths(const struct path *, struct path *, unsigned);
-extern void drop_collected_paths(struct path *, struct path *);
+extern const struct path *collect_paths(const struct path *, struct path *, unsigned);
+extern void drop_collected_paths(const struct path *, const struct path *);
extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
extern int cifs_root_data(char **dev, char **opts);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 9aed39abc94b..afe1d8f09d89 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -122,8 +122,6 @@ struct nfs_pageio_descriptor {
/* arbitrarily selected limit to number of mirrors */
#define NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX 16
-#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
-
extern struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx,
struct page *page,
unsigned int pgbase,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ac4bff6e9913..d56583572c98 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -862,7 +862,7 @@ struct nfs_getaclres {
size_t acl_len;
size_t acl_data_offset;
int acl_flags;
- struct page * acl_scratch;
+ struct folio * acl_scratch;
};
struct nfs_setattrres {
@@ -1596,7 +1596,7 @@ struct nfs42_listxattrsargs {
struct nfs42_listxattrsres {
struct nfs4_sequence_res seq_res;
- struct page *scratch;
+ struct folio *scratch;
void *xattr_buf;
size_t xattr_len;
u64 cookie;
diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 5c7c92659e73..7ca2715edccc 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -65,6 +65,8 @@ struct nfsd_localio_operations {
struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **);
struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *);
struct file *(*nfsd_file_file)(struct nfsd_file *);
+ void (*nfsd_file_dio_alignment)(struct nfsd_file *,
+ u32 *, u32 *, u32 *);
} ____cacheline_aligned;
extern void nfsd_localio_ops_init(void);
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 615a560d9edb..f3b13da78aac 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -103,7 +103,7 @@ struct nvmem_cell_info {
*
* Note: A default "nvmem<id>" name will be assigned to the device if
* no name is specified in its configuration. In such case "<id>" is
- * generated with ida_simple_get() and provided id field is ignored.
+ * generated with ida_alloc() and provided id field is ignored.
*
* Note: Specifying name and setting id to -1 implies a unique device
* whose name is provided as-is (kept unaltered).
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 185644e288ea..09b581c1d878 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1229,6 +1229,8 @@ void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
void end_page_writeback(struct page *page);
void folio_end_writeback(struct folio *folio);
+void folio_end_writeback_no_dropbehind(struct folio *folio);
+void folio_end_dropbehind(struct folio *folio);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
diff --git a/include/linux/panic.h b/include/linux/panic.h
index 7be742628c25..6f972a66c13e 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -43,6 +43,12 @@ void abort(void);
extern atomic_t panic_cpu;
#define PANIC_CPU_INVALID -1
+bool panic_try_start(void);
+void panic_reset(void);
+bool panic_in_progress(void);
+bool panic_on_this_cpu(void);
+bool panic_on_other_cpu(void);
+
/*
* Only to be used by arch init code. If the user over-wrote the default
* CONFIG_PANIC_TIMEOUT, honor it.
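These helpers replace the open-coded panic_cpu cmpxchg that crash_kexec(), nmi_panic() and vpanic() carry today (all three are converted later in this diff). The intended pattern, sketched with a placeholder work function:

if (panic_try_start()) {
	/* First CPU in: single-threaded crash/panic path. */
	do_single_threaded_work();	/* placeholder */
	panic_reset();			/* allow a later panic/crash_kexec */
} else if (panic_on_other_cpu()) {
	/* Some other CPU owns the panic: get out of its way. */
	panic_smp_self_stop();
}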
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 5d22b803f51e..45c663124c9b 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -330,8 +330,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress)
#endif
-bool this_cpu_in_panic(void);
-
#ifdef CONFIG_SMP
extern int __printk_cpu_sync_try_get(void);
extern void __printk_cpu_sync_wait(void);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 34d6a0e108c3..525aa2a632b2 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
* pins the final release of task.io_context. Also protects ->cpuset and
* ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
- * Nests both inside and outside of read_lock(&tasklist_lock).
- * It must not be nested with write_lock_irq(&tasklist_lock),
- * neither inside nor outside.
+ * Nests inside of read_lock(&tasklist_lock). It must not be nested with
+ * write_lock_irq(&tasklist_lock), neither inside nor outside.
*/
static inline void task_lock(struct task_struct *p)
{
diff --git a/include/linux/security.h b/include/linux/security.h
index bd33f194c94a..92ac3f27b973 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -391,7 +391,7 @@ int security_dentry_init_security(struct dentry *dentry, int mode,
const char **xattr_name,
struct lsm_context *lsmcxt);
int security_dentry_create_files_as(struct dentry *dentry, int mode,
- struct qstr *name,
+ const struct qstr *name,
const struct cred *old,
struct cred *new);
int security_path_notify(const struct path *path, u64 mask,
@@ -872,7 +872,7 @@ static inline int security_dentry_init_security(struct dentry *dentry,
}
static inline int security_dentry_create_files_as(struct dentry *dentry,
- int mode, struct qstr *name,
+ int mode, const struct qstr *name,
const struct cred *old,
struct cred *new)
{
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index f6aeed07fe04..891f6173c951 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -23,43 +23,30 @@ extern unsigned int nlm_debug;
#define dprintk(fmt, ...) \
dfprintk(FACILITY, fmt, ##__VA_ARGS__)
-#define dprintk_cont(fmt, ...) \
- dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__)
#define dprintk_rcu(fmt, ...) \
dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__)
-#define dprintk_rcu_cont(fmt, ...) \
- dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__)
#undef ifdebug
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac))
-# define dfprintk(fac, fmt, ...) \
-do { \
- ifdebug(fac) \
- printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \
-} while (0)
+# if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE)
+# define __sunrpc_printk(fmt, ...) trace_printk(fmt, ##__VA_ARGS__)
+# else
+# define __sunrpc_printk(fmt, ...) printk(KERN_DEFAULT fmt, ##__VA_ARGS__)
+# endif
-# define dfprintk_cont(fac, fmt, ...) \
+# define dfprintk(fac, fmt, ...) \
do { \
ifdebug(fac) \
- printk(KERN_CONT fmt, ##__VA_ARGS__); \
+ __sunrpc_printk(fmt, ##__VA_ARGS__); \
} while (0)
# define dfprintk_rcu(fac, fmt, ...) \
do { \
ifdebug(fac) { \
rcu_read_lock(); \
- printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \
- rcu_read_unlock(); \
- } \
-} while (0)
-
-# define dfprintk_rcu_cont(fac, fmt, ...) \
-do { \
- ifdebug(fac) { \
- rcu_read_lock(); \
- printk(KERN_CONT fmt, ##__VA_ARGS__); \
+ __sunrpc_printk(fmt, ##__VA_ARGS__); \
rcu_read_unlock(); \
} \
} while (0)
@@ -68,7 +55,6 @@ do { \
#else
# define ifdebug(fac) if (0)
# define dfprintk(fac, fmt, ...) do {} while (0)
-# define dfprintk_cont(fac, fmt, ...) do {} while (0)
# define dfprintk_rcu(fac, fmt, ...) do {} while (0)
# define RPC_IFDEBUG(x)
#endif
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 40cbe81360ed..5506d20857c3 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -196,7 +196,7 @@ struct svc_rqst {
struct xdr_buf rq_arg;
struct xdr_stream rq_arg_stream;
struct xdr_stream rq_res_stream;
- struct page *rq_scratch_page;
+ struct folio *rq_scratch_folio;
struct xdr_buf rq_res;
unsigned long rq_maxpages; /* num of entries in rq_pages */
struct page * *rq_pages;
@@ -503,7 +503,7 @@ static inline void svcxdr_init_decode(struct svc_rqst *rqstp)
buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len;
xdr_init_decode(xdr, buf, argv->iov_base, NULL);
- xdr_set_scratch_page(xdr, rqstp->rq_scratch_page);
+ xdr_set_scratch_folio(xdr, rqstp->rq_scratch_folio);
}
/**
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 369a89aea186..fde60d4e2cd5 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -165,7 +165,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags,
const struct cred *cred);
-void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net);
+void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net,
+ bool unregister);
void svc_xprt_received(struct svc_xprt *xprt);
void svc_xprt_enqueue(struct svc_xprt *xprt);
void svc_xprt_put(struct svc_xprt *xprt);
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8a9ec617cf66..49278749ad0c 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -288,16 +288,16 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
}
/**
- * xdr_set_scratch_page - Attach a scratch buffer for decoding data
+ * xdr_set_scratch_folio - Attach a scratch buffer for decoding data
* @xdr: pointer to xdr_stream struct
- * @page: an anonymous page
+ * @page: an anonymous folio
*
* See xdr_set_scratch_buffer().
*/
static inline void
-xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page)
+xdr_set_scratch_folio(struct xdr_stream *xdr, struct folio *folio)
{
- xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE);
+ xdr_set_scratch_buffer(xdr, folio_address(folio), folio_size(folio));
}
/**
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 09855d819418..f648044466d5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -965,6 +965,18 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
__ret; \
})
+#define __wait_event_state_exclusive(wq, condition, state) \
+ ___wait_event(wq, condition, state, 1, 0, schedule())
+
+#define wait_event_state_exclusive(wq, condition, state) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_state_exclusive(wq, condition, state); \
+ __ret; \
+})
+
#define __wait_event_killable_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_KILLABLE, 0, timeout, \
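The new wait_event_state_exclusive() is the exclusive-waiter variant of wait_event_state(): the task is queued exclusively, so each wake-up pops a single waiter instead of the whole queue. A hedged usage sketch — wq and slot_free() are hypothetical:

/* Sleep killably until a slot frees up; one waiter is woken per
 * wake_up(), so waiters don't stampede the freed resource. */
ret = wait_event_state_exclusive(wq, slot_free(), TASK_KILLABLE);
if (ret)
	return ret;	/* fatal signal arrived before the condition */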
diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h
index 0406ebe2a80a..7ead1c61f0cb 100644
--- a/include/trace/misc/fs.h
+++ b/include/trace/misc/fs.h
@@ -141,3 +141,25 @@
{ ATTR_TIMES_SET, "TIMES_SET" }, \
{ ATTR_TOUCH, "TOUCH"}, \
{ ATTR_DELEG, "DELEG"})
+
+#define show_statx_mask(flags) \
+ __print_flags(flags, "|", \
+ { STATX_TYPE, "TYPE" }, \
+ { STATX_MODE, "MODE" }, \
+ { STATX_NLINK, "NLINK" }, \
+ { STATX_UID, "UID" }, \
+ { STATX_GID, "GID" }, \
+ { STATX_ATIME, "ATIME" }, \
+ { STATX_MTIME, "MTIME" }, \
+ { STATX_CTIME, "CTIME" }, \
+ { STATX_INO, "INO" }, \
+ { STATX_SIZE, "SIZE" }, \
+ { STATX_BLOCKS, "BLOCKS" }, \
+ { STATX_BASIC_STATS, "BASIC_STATS" }, \
+ { STATX_BTIME, "BTIME" }, \
+ { STATX_MNT_ID, "MNT_ID" }, \
+ { STATX_DIOALIGN, "DIOALIGN" }, \
+ { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \
+ { STATX_SUBVOL, "SUBVOL" }, \
+ { STATX_WRITE_ATOMIC, "WRITE_ATOMIC" }, \
+ { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" })
diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h
index 1c4c2dd29112..411dcc1e4a35 100644
--- a/include/uapi/linux/ext4.h
+++ b/include/uapi/linux/ext4.h
@@ -33,6 +33,8 @@
#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32)
#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid)
#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid)
+#define EXT4_IOC_GET_TUNE_SB_PARAM _IOR('f', 45, struct ext4_tune_sb_params)
+#define EXT4_IOC_SET_TUNE_SB_PARAM _IOW('f', 46, struct ext4_tune_sb_params)
#define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32)
@@ -108,6 +110,57 @@ struct ext4_new_group_input {
__u16 unused;
};
+struct ext4_tune_sb_params {
+ __u32 set_flags;
+ __u32 checkinterval;
+ __u16 errors_behavior;
+ __u16 mnt_count;
+ __u16 max_mnt_count;
+ __u16 raid_stride;
+ __u64 last_check_time;
+ __u64 reserved_blocks;
+ __u64 blocks_count;
+ __u32 default_mnt_opts;
+ __u32 reserved_uid;
+ __u32 reserved_gid;
+ __u32 raid_stripe_width;
+ __u16 encoding;
+ __u16 encoding_flags;
+ __u8 def_hash_alg;
+ __u8 pad_1;
+ __u16 pad_2;
+ __u32 feature_compat;
+ __u32 feature_incompat;
+ __u32 feature_ro_compat;
+ __u32 set_feature_compat_mask;
+ __u32 set_feature_incompat_mask;
+ __u32 set_feature_ro_compat_mask;
+ __u32 clear_feature_compat_mask;
+ __u32 clear_feature_incompat_mask;
+ __u32 clear_feature_ro_compat_mask;
+ __u8 mount_opts[64];
+ __u8 pad[64];
+};
+
+#define EXT4_TUNE_FL_ERRORS_BEHAVIOR 0x00000001
+#define EXT4_TUNE_FL_MNT_COUNT 0x00000002
+#define EXT4_TUNE_FL_MAX_MNT_COUNT 0x00000004
+#define EXT4_TUNE_FL_CHECKINTRVAL 0x00000008
+#define EXT4_TUNE_FL_LAST_CHECK_TIME 0x00000010
+#define EXT4_TUNE_FL_RESERVED_BLOCKS 0x00000020
+#define EXT4_TUNE_FL_RESERVED_UID 0x00000040
+#define EXT4_TUNE_FL_RESERVED_GID 0x00000080
+#define EXT4_TUNE_FL_DEFAULT_MNT_OPTS 0x00000100
+#define EXT4_TUNE_FL_DEF_HASH_ALG 0x00000200
+#define EXT4_TUNE_FL_RAID_STRIDE 0x00000400
+#define EXT4_TUNE_FL_RAID_STRIPE_WIDTH 0x00000800
+#define EXT4_TUNE_FL_MOUNT_OPTS 0x00001000
+#define EXT4_TUNE_FL_FEATURES 0x00002000
+#define EXT4_TUNE_FL_EDIT_FEATURES 0x00004000
+#define EXT4_TUNE_FL_FORCE_FSCK 0x00008000
+#define EXT4_TUNE_FL_ENCODING 0x00010000
+#define EXT4_TUNE_FL_ENCODING_FLAGS 0x00020000
+
/*
* Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
* It indicates that the entry in extent status cache is for a hole.
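From userspace the new ioctl pair is naturally used read-modify-write: fetch the current parameters, change a field, and name it in set_flags so that (judging by the flag names) only that field is applied on the way back. A hedged sketch, error handling trimmed:

struct ext4_tune_sb_params p = { 0 };

if (ioctl(fd, EXT4_IOC_GET_TUNE_SB_PARAM, &p) == 0) {
	p.max_mnt_count = 50;			/* fsck every 50 mounts */
	p.set_flags = EXT4_TUNE_FL_MAX_MNT_COUNT;
	ioctl(fd, EXT4_IOC_SET_TUNE_SB_PARAM, &p);
}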
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 122d6586e8d4..c13e1f9a2f12 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -235,6 +235,11 @@
*
* 7.44
* - add FUSE_NOTIFY_INC_EPOCH
+ *
+ * 7.45
+ * - add FUSE_COPY_FILE_RANGE_64
+ * - add struct fuse_copy_file_range_out
+ * - add FUSE_NOTIFY_PRUNE
*/
#ifndef _LINUX_FUSE_H
@@ -270,7 +275,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 44
+#define FUSE_KERNEL_MINOR_VERSION 45
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -657,6 +662,7 @@ enum fuse_opcode {
FUSE_SYNCFS = 50,
FUSE_TMPFILE = 51,
FUSE_STATX = 52,
+ FUSE_COPY_FILE_RANGE_64 = 53,
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -675,7 +681,7 @@ enum fuse_notify_code {
FUSE_NOTIFY_DELETE = 6,
FUSE_NOTIFY_RESEND = 7,
FUSE_NOTIFY_INC_EPOCH = 8,
- FUSE_NOTIFY_CODE_MAX,
+ FUSE_NOTIFY_PRUNE = 9,
};
/* The read buffer is required to be at least 8k, but may be much larger */
@@ -1114,6 +1120,12 @@ struct fuse_notify_retrieve_in {
uint64_t dummy4;
};
+struct fuse_notify_prune_out {
+ uint32_t count;
+ uint32_t padding;
+ uint64_t spare;
+};
+
struct fuse_backing_map {
int32_t fd;
uint32_t flags;
@@ -1126,6 +1138,7 @@ struct fuse_backing_map {
#define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \
struct fuse_backing_map)
#define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t)
+#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3)
struct fuse_lseek_in {
uint64_t fh;
@@ -1148,6 +1161,11 @@ struct fuse_copy_file_range_in {
uint64_t flags;
};
+/* For FUSE_COPY_FILE_RANGE_64 */
+struct fuse_copy_file_range_out {
+ uint64_t bytes_copied;
+};
+
#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
struct fuse_setupmapping_in {
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 8958ebfcff94..55749cb0b81d 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -22,12 +22,16 @@
* KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image.
* KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd
* fd field.
+ * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new
+ * kernel on x86. This is already the default behavior on
+ * some other architectures, like ARM64 and PowerPC.
*/
#define KEXEC_FILE_UNLOAD 0x00000001
#define KEXEC_FILE_ON_CRASH 0x00000002
#define KEXEC_FILE_NO_INITRAMFS 0x00000004
#define KEXEC_FILE_DEBUG 0x00000008
#define KEXEC_FILE_NO_CMA 0x00000010
+#define KEXEC_FILE_FORCE_DTB 0x00000020
/* These values match the ELF architecture values.
* Unless there is a good reason that should continue to be the case.
diff --git a/init/main.c b/init/main.c
index fab4f599c035..07a3116811c5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -545,6 +545,12 @@ static int __init unknown_bootoption(char *param, char *val,
const char *unused, void *arg)
{
size_t len = strlen(param);
+ /*
+ * Well-known bootloader identifiers:
+ * 1. LILO/Grub pass "BOOT_IMAGE=...";
+ * 2. kexec/kdump (kexec-tools) pass "kexec".
+ */
+ const char *bootloader[] = { "BOOT_IMAGE=", "kexec", NULL };
/* Handle params aliased to sysctls */
if (sysctl_is_alias(param))
@@ -552,6 +558,12 @@ static int __init unknown_bootoption(char *param, char *val,
repair_env_string(param, val);
+ /* Handle bootloader identifier */
+ for (int i = 0; bootloader[i]; i++) {
+ if (strstarts(param, bootloader[i]))
+ return 0;
+ }
+
/* Handle obsolete-style parameters */
if (obsolete_checksetup(param))
return 0;
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 1224dd937df0..422270d64820 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS
CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
is required to be built-in.
+config CRASH_DUMP_KUNIT_TEST
+ tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS
+ depends on CRASH_DUMP && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This option builds KUnit unit tests for kernel crash dumps. The
+ tests verify the correctness of the covered functions and guard
+ against regressions.
+
+ If unsure, say N.
+
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 41751834e764..df3dd8291bb6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_CRASH_DUMP) += crash_core.o
obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
+obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1605df0a171e..fda6beb041e0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -680,7 +680,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct audit_node *node;
- struct path *paths;
+ const struct path *paths;
struct path array[16];
int err;
@@ -703,7 +703,7 @@ void audit_trim_trees(void)
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
- for (struct path *p = paths; p->dentry; p++) {
+ for (const struct path *p = paths; p->dentry; p++) {
struct inode *inode = p->dentry->d_inode;
if (inode_to_key(inode) == chunk->key) {
node->index &= ~(1U<<31);
@@ -742,9 +742,9 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mounts(struct path *paths, struct audit_tree *tree)
+static int tag_mounts(const struct path *paths, struct audit_tree *tree)
{
- for (struct path *p = paths; p->dentry; p++) {
+ for (const struct path *p = paths; p->dentry; p++) {
int err = tag_chunk(p->dentry->d_inode, tree);
if (err)
return err;
@@ -807,7 +807,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
struct audit_tree *seed = rule->tree, *tree;
struct path path;
struct path array[16];
- struct path *paths;
+ const struct path *paths;
int err;
rule->tree = NULL;
@@ -879,7 +879,7 @@ int audit_tag_tree(char *old, char *new)
int failed = 0;
struct path path1, path2;
struct path array[16];
- struct path *paths;
+ const struct path *paths;
int err;
err = kern_path(new, 0, &path2);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a4ef79591eb2..3b1c43382eec 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -22,6 +22,7 @@
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/delay.h>
+#include <linux/panic.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec);
__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
- int old_cpu, this_cpu;
-
- /*
- * Only one CPU is allowed to execute the crash_kexec() code as with
- * panic(). Otherwise parallel calls of panic() and crash_kexec()
- * may stop each other. To exclude them, we use panic_cpu here too.
- */
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ if (panic_try_start()) {
/* This is the 1st CPU which comes here, so go ahead. */
__crash_kexec(regs);
@@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
* Reset panic_cpu to allow another panic()/crash_kexec()
* call.
*/
- atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+ panic_reset();
}
}
@@ -274,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
return 0;
}
+/**
+ * crash_exclude_mem_range - exclude a mem range from existing ranges
+ * @mem: mem->ranges contains an array of ranges sorted in ascending order
+ * @mstart: the start of the to-be-excluded range
+ * @mend: the end of the to-be-excluded range
+ *
+ * If you are unsure whether a range split will happen, always make sure
+ * mem->max_nr_ranges == mem->nr_ranges + 1
+ * before each call, to avoid failing with -ENOMEM.
+ *
+ * Return: 0 if a memory range was excluded successfully, -ENOMEM if
+ * mem->ranges doesn't have space to hold the split ranges.
+ */
int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
@@ -333,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
+EXPORT_SYMBOL_GPL(crash_exclude_mem_range);
ssize_t crash_get_memory_size(void)
{
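The caveat in the new kernel-doc reduces to a simple rule at each call site: keep one spare slot so a mid-range split cannot fail. Sketch:

/* Guarantee room for a possible split before every exclusion. */
if (mem->nr_ranges + 1 > mem->max_nr_ranges)
	return -ENOMEM;		/* or grow mem->ranges first */

ret = crash_exclude_mem_range(mem, mstart, mend);
if (ret)
	return ret;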
diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c
new file mode 100644
index 000000000000..8aadf6801530
--- /dev/null
+++ b/kernel/crash_core_test.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/crash_core.h> // for struct crash_mem and struct range
+
+// Helper to create and initialize crash_mem
+static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges,
+ unsigned int nr_initial_ranges,
+ const struct range *initial_ranges)
+{
+ struct crash_mem *mem;
+ size_t alloc_size;
+
+ // Check if max_ranges can even hold initial_ranges
+ if (max_ranges < nr_initial_ranges) {
+ kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n",
+ max_ranges, nr_initial_ranges);
+ return NULL;
+ }
+
+ alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range);
+ mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL);
+ if (!mem) {
+ kunit_err(test, "Failed to allocate crash_mem\n");
+ return NULL;
+ }
+
+ mem->max_nr_ranges = max_ranges;
+ mem->nr_ranges = nr_initial_ranges;
+ if (initial_ranges && nr_initial_ranges > 0) {
+ memcpy(mem->ranges, initial_ranges,
+ nr_initial_ranges * sizeof(struct range));
+ }
+
+ return mem;
+}
+
+// Helper to compare ranges for assertions
+static void assert_ranges_equal(struct kunit *test,
+ const struct range *actual_ranges,
+ unsigned int actual_nr_ranges,
+ const struct range *expected_ranges,
+ unsigned int expected_nr_ranges,
+ const char *case_name)
+{
+ unsigned int i;
+
+ KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges,
+ "%s: Number of ranges mismatch.", case_name);
+
+ for (i = 0; i < expected_nr_ranges; i++) {
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start,
+ "%s: Range %u start mismatch.", case_name, i);
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end,
+ "%s: Range %u end mismatch.", case_name, i);
+ }
+}
+
+// Structure for test parameters
+struct exclude_test_param {
+ const char *description;
+ unsigned long long exclude_start;
+ unsigned long long exclude_end;
+ unsigned int initial_max_ranges;
+ const struct range *initial_ranges;
+ unsigned int initial_nr_ranges;
+ const struct range *expected_ranges;
+ unsigned int expected_nr_ranges;
+ int expected_ret;
+};
+
+static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params)
+{
+ struct crash_mem *mem;
+ int ret;
+
+ kunit_info(test, "%s", params->description);
+
+ mem = create_crash_mem(test, params->initial_max_ranges,
+ params->initial_nr_ranges, params->initial_ranges);
+ if (!mem)
+ return; // Error already logged by create_crash_mem or kunit_kzalloc
+
+ ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end);
+
+ KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret,
+ "%s: Return value mismatch.", params->description);
+
+ if (params->expected_ret == 0) {
+ assert_ranges_equal(test, mem->ranges, mem->nr_ranges,
+ params->expected_ranges, params->expected_nr_ranges,
+ params->description);
+ } else {
+ // If an error is expected, nr_ranges might still be relevant to check
+ // depending on the exact point of failure. For ENOMEM on split,
+ // nr_ranges shouldn't have changed.
+ KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges,
+ mem->nr_ranges,
+ "%s: Number of ranges mismatch on error.",
+ params->description);
+ }
+}
+
+/*
+ * Test Strategy 1: One to-be-excluded range A and one existing range B.
+ *
+ * Exhaust all possibilities of the position of A regarding B.
+ */
+
+static const struct range single_range_b = { .start = 100, .end = 199 };
+
+static const struct exclude_test_param exclude_single_range_test_data[] = {
+ {
+ .description = "1.1: A is left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 50,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.2: A's right boundary touches B's left boundary",
+ .exclude_start = 10, .exclude_end = 99,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.3: A overlaps B's left part",
+ .exclude_start = 50, .exclude_end = 149,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.4: A is completely inside B",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 119 },
+ { .start = 180, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.5: A overlaps B's right part",
+ .exclude_start = 150, .exclude_end = 249,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.6: A's left boundary touches B's right boundary",
+ .exclude_start = 200, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.7: A is right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 300,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.8: A completely covers B and extends beyond",
+ .exclude_start = 50, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.9: A covers B and extends to the left",
+ .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.10: A covers B and extends to the right",
+ .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.11: A is identical to B",
+ .exclude_start = 100, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.12: A is a point, left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 10,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.13: A is a point, at start of B",
+ .exclude_start = 100, .exclude_end = 100,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.14: A is a point, in middle of B (causes split)",
+ .exclude_start = 150, .exclude_end = 150,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 149 },
+ { .start = 151, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.15: A is a point, at end of B",
+ .exclude_start = 199, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.16: A is a point, right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ // ENOMEM case for single range split
+ {
+ .description = "1.17: A completely inside B (split), no space (ENOMEM)",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 1, // Not enough for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 1, // Should remain unchanged
+ .expected_ret = -ENOMEM,
+ },
+};
+
+
+static void exclude_single_range_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description);
+ run_exclude_test_case(test, &exclude_single_range_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * Test Strategy 2: Regression test.
+ */
+
+static const struct exclude_test_param exclude_range_regression_test_data[] = {
+ // Test data from commit a2e9a95d2190
+ {
+ .description = "2.1: exclude low 1M",
+ .exclude_start = 0, .exclude_end = (1 << 20) - 1,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 0, .end = 0x3efff },
+ { .start = 0x3f000, .end = 0x3ffff },
+ { .start = 0x40000, .end = 0x9ffff }
+ },
+ .initial_nr_ranges = 3,
+ .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u
+ {
+ .description = "2.2: when range out of bound",
+ .exclude_start = 100, .exclude_end = 200,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 1, .end = 299 },
+ { .start = 401, .end = 1000 },
+ { .start = 1001, .end = 2000 }
+ },
+ .initial_nr_ranges = 3,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 3, // Should remain unchanged
+ .expected_ret = -ENOMEM
+ },
+
+};
+
+
+static void exclude_range_regression_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description);
+ run_exclude_test_case(test, &exclude_range_regression_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * KUnit Test Suite
+ */
+static struct kunit_case crash_exclude_mem_range_test_cases[] = {
+ KUNIT_CASE(exclude_single_range_test),
+ KUNIT_CASE(exclude_range_regression_test),
+ {}
+};
+
+static struct kunit_suite crash_exclude_mem_range_suite = {
+ .name = "crash_exclude_mem_range_tests",
+ .test_cases = crash_exclude_mem_range_test_cases,
+ // .init and .exit can be NULL if not needed globally for the suite
+};
+
+kunit_test_suite(crash_exclude_mem_range_suite);
+
+MODULE_DESCRIPTION("crash dump KUnit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/fork.c b/kernel/fork.c
index f1688b3e79a6..3da0f08615a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2132,9 +2132,7 @@ __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
-#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
-#endif
p->blocked_on = NULL; /* not blocked yet */
@@ -2547,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu)
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
- CLONE_IO;
+ CLONE_IO|CLONE_VM|CLONE_UNTRACED;
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = flags,
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
@@ -2663,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.name = name,
@@ -2681,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
};
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8708a1205f82..b2c1f14b8129 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -95,9 +95,41 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+static bool task_is_hung(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+ unsigned int state = READ_ONCE(t->__state);
+
+ /*
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ * skip the TASK_FROZEN tasks -- they are legitimately stopped by the freezer
+ */
+ if (!(state & TASK_UNINTERRUPTIBLE) ||
+ (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
+ return false;
+
+ /*
+ * When a freshly created task is scheduled once and changes its state
+ * to TASK_UNINTERRUPTIBLE without ever having been switched out, it
+ * mustn't be checked.
+ */
+ if (unlikely(!switch_count))
+ return false;
+
+ if (switch_count != t->last_switch_count) {
+ t->last_switch_count = switch_count;
+ t->last_switch_time = jiffies;
+ return false;
+ }
+ if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return false;
+
+ return true;
+}
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
-static void debug_show_blocker(struct task_struct *task)
+static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
struct task_struct *g, *t;
unsigned long owner, blocker, blocker_type;
@@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task)
t->pid, rwsem_blocked_by);
break;
}
- sched_show_task(t);
+ /* Avoid duplicated task dump, skip if the task is also hung. */
+ if (!task_is_hung(t, timeout))
+ sched_show_task(t);
return;
}
}
#else
-static inline void debug_show_blocker(struct task_struct *task)
+static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
}
#endif
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- /*
- * Ensure the task is not frozen.
- * Also, skip vfork and any other user process that freezer should skip.
- */
- if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
- return;
-
- /*
- * When a freshly created task is scheduled once, changes its state to
- * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
- * musn't be checked.
- */
- if (unlikely(!switch_count))
- return;
-
- if (switch_count != t->last_switch_count) {
- t->last_switch_count = switch_count;
- t->last_switch_time = jiffies;
- return;
- }
- if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ if (!task_is_hung(t, timeout))
return;
/*
@@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- debug_show_blocker(t);
+ debug_show_blocker(t, timeout);
hung_task_show_lock = true;
if (sysctl_hung_task_all_cpu_backtrace)
@@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
- unsigned int state;
if (!max_count--)
goto unlock;
@@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /*
- * skip the TASK_KILLABLE tasks -- these can be killed
- * skip the TASK_IDLE tasks -- those are genuinely idle
- */
- state = READ_ONCE(t->__state);
- if ((state & TASK_UNINTERRUPTIBLE) &&
- !(state & TASK_WAKEKILL) &&
- !(state & TASK_NOLOAD))
- check_hung_task(t, timeout);
+
+ check_hung_task(t, timeout);
}
unlock:
rcu_read_unlock();
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index cf4af5728307..2b082a7e24a2 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void)
char namebuf[KSYM_NAME_LEN];
struct test_stat *stat, *stat2;
- stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL);
if (!stat)
return -ENOMEM;
stat2 = stat + 1;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 1d85597057e1..6563141f5de9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area,
memcpy(dst_entries, src_entries, bytes_to_move);
entries_moved = bytes_to_move >> entry_size_log;
+ /*
+ * A write memory barrier is required here, to ensure
+ * that the writes from the memcpy() are visible before
+ * the count is updated. Without this, it is possible for
+ * a user to observe a new count value but stale
+ * coverage data.
+ */
+ smp_wmb();
+
switch (mode) {
case KCOV_MODE_TRACE_PC:
WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
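The smp_wmb() needs a pairing read barrier on the consumer side: load the count first, order that load against the entry reads, then walk the entries. Sketched reader, with hypothetical names:

n = READ_ONCE(*(unsigned long *)area);	/* load the count first */
smp_rmb();				/* pairs with the smp_wmb() above */
for (i = 0; i < n; i++)
	consume(entry_at(area, i));	/* entries now at least as new */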
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 31203f0bacaf..fa00b239c5d9 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void)
if (!image)
return NULL;
- image->head = 0;
image->entry = &image->head;
image->last_entry = &image->head;
image->control_page = ~0; /* By default this does not apply */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 91d46502a817..eb62a9794242 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
}
image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+ image->force_dtb = flags & KEXEC_FILE_FORCE_DTB;
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 555488eb1a18..5083c68c3a4e 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -988,6 +988,26 @@ static const void *kho_get_fdt(void)
}
/**
+ * is_kho_boot - check if the current kernel was booted via a KHO-enabled kexec
+ *
+ * This function checks if the current kernel was loaded through a kexec
+ * operation with KHO enabled, by verifying that a valid KHO FDT
+ * was passed.
+ *
+ * Note: This function returns reliable results only after
+ * kho_populate() has been called during early boot. Before that,
+ * it may return false even if KHO data is present.
+ *
+ * Return: true if booted via KHO-enabled kexec, false otherwise
+ */
+bool is_kho_boot(void)
+{
+ return !!kho_get_fdt();
+}
+EXPORT_SYMBOL_GPL(is_kho_boot);
+
+/**
* kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
* @name: the name of the sub FDT passed to kho_add_subtree().
* @phys: if found, the physical address of the sub FDT is stored in @phys.
@@ -1269,7 +1289,7 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_enable)
+ if (!kho_out.finalized)
return 0;
image->kho.fdt = page_to_phys(kho_out.ser.fdt);
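A hedged example of the intended call site for is_kho_boot(): an initcall that only restores preserved state when the kernel actually came up via a KHO kexec. restore_preserved_state() is a placeholder for the caller's own recovery logic:

static int __init my_restore_init(void)
{
	if (!is_kho_boot())
		return 0;	/* cold boot: nothing was handed over */

	return restore_preserved_state();
}
late_initcall(my_restore_init);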
diff --git a/kernel/panic.c b/kernel/panic.c
index 72fcbb5a071b..24cc3eec1805 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
-int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
+int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
static unsigned long tainted_mask =
IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
@@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly;
static bool panic_console_replay;
bool panic_triggering_all_cpu_backtrace;
+static bool panic_this_cpu_backtrace_printed;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -77,6 +78,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+static void panic_print_deprecated(void)
+{
+ pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
+}
+
#ifdef CONFIG_SYSCTL
/*
@@ -125,7 +131,7 @@ static int proc_taint(const struct ctl_table *table, int write,
static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+ panic_print_deprecated();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
@@ -294,6 +300,59 @@ void __weak crash_smp_send_stop(void)
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+bool panic_try_start(void)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
+
+ return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu);
+}
+EXPORT_SYMBOL(panic_try_start);
+
+void panic_reset(void)
+{
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_reset);
+
+bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_in_progress);
+
+/* Return true if a panic is in progress on the current CPU. */
+bool panic_on_this_cpu(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+EXPORT_SYMBOL(panic_on_this_cpu);
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool panic_on_other_cpu(void)
+{
+ return (panic_in_progress() && !panic_on_this_cpu());
+}
+EXPORT_SYMBOL(panic_on_other_cpu);
+
/*
* A variant of panic() called from NMI context. We return if we've already
* panicked on this CPU. If another CPU already panicked, loop in
@@ -302,15 +361,9 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, this_cpu;
-
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- /* atomic_try_cmpxchg updates old_cpu on failure */
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
+ if (panic_try_start())
panic("%s", msg);
- else if (old_cpu != this_cpu)
+ else if (panic_on_other_cpu())
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
@@ -328,6 +381,19 @@ void check_panic_on_warn(const char *origin)
origin, limit);
}
+static void panic_trigger_all_cpu_backtrace(void)
+{
+ /* Temporarily allow non-panic CPUs to write their backtraces. */
+ panic_triggering_all_cpu_backtrace = true;
+
+ if (panic_this_cpu_backtrace_printed)
+ trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id());
+ else
+ trigger_all_cpu_backtrace();
+
+ panic_triggering_all_cpu_backtrace = false;
+}
+
/*
* Helper that triggers the NMI backtrace (if set in panic_print)
* and then performs the secondary CPUs shutdown - we cannot have
@@ -335,12 +401,8 @@ void check_panic_on_warn(const char *origin)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & SYS_INFO_ALL_CPU_BT) {
- /* Temporary allow non-panic CPUs to write their backtraces. */
- panic_triggering_all_cpu_backtrace = true;
- trigger_all_cpu_backtrace();
- panic_triggering_all_cpu_backtrace = false;
- }
+ if (panic_print & SYS_INFO_ALL_CPU_BT)
+ panic_trigger_all_cpu_backtrace();
/*
* Note that smp_send_stop() is the usual SMP shutdown function,
@@ -368,7 +430,6 @@ void vpanic(const char *fmt, va_list args)
static char buf[1024];
long i, i_next = 0, len;
int state = 0;
- int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
if (panic_on_warn) {
@@ -405,13 +466,10 @@ void vpanic(const char *fmt, va_list args)
- * `old_cpu == this_cpu' means we came from nmi_panic() which sets
- * panic_cpu to this CPU. In this case, this is also the 1st CPU.
+ * panic_on_this_cpu() means we came from nmi_panic(), which set
+ * panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- /* atomic_try_cmpxchg updates old_cpu on failure */
+ /* The first caller of panic_try_start() becomes the panic CPU. */
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ if (panic_try_start()) {
/* go ahead */
- } else if (old_cpu != this_cpu)
+ } else if (panic_on_other_cpu())
panic_smp_self_stop();
console_verbose();
@@ -422,13 +480,15 @@ void vpanic(const char *fmt, va_list args)
buf[len - 1] = '\0';
pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ panic_this_cpu_backtrace_printed = true;
+ } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
-#endif
+ panic_this_cpu_backtrace_printed = true;
+ }
/*
* If kgdb is enabled, give it a chance to run before we stop all
@@ -937,12 +997,29 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
core_param(panic, panic_timeout, int, 0644);
-core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
core_param(panic_console_replay, panic_console_replay, bool, 0644);
+static int panic_print_set(const char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_set_ulong(val, kp);
+}
+
+static int panic_print_get(char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_get_ulong(val, kp);
+}
+
+static const struct kernel_param_ops panic_print_ops = {
+ .set = panic_print_set,
+ .get = panic_print_get,
+};
+__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644);
+
static int __init oops_setup(char *s)
{
if (!s)
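
The panic_cpu handshake that panic_try_start(), panic_on_this_cpu() and panic_on_other_cpu() now wrap is an ordinary compare-and-exchange claim on a CPU id. A standalone sketch of the same pattern with C11 atomics (plain ints standing in for the kernel's atomic_t machinery):

```c
#include <stdatomic.h>
#include <stdbool.h>

#define PANIC_CPU_INVALID (-1)

static atomic_int panic_cpu = PANIC_CPU_INVALID;

/* The first caller wins the claim; everyone else sees who holds it. */
static bool panic_try_start(int this_cpu)
{
	int old = PANIC_CPU_INVALID;

	/* Strong CAS: succeeds only if no CPU has claimed the panic yet. */
	return atomic_compare_exchange_strong(&panic_cpu, &old, this_cpu);
}

static bool panic_on_this_cpu(int this_cpu)
{
	return atomic_load(&panic_cpu) == this_cpu;
}

static bool panic_on_other_cpu(int this_cpu)
{
	int cur = atomic_load(&panic_cpu);

	return cur != PANIC_CPU_INVALID && cur != this_cpu;
}
```
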
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index ef282001f200..f72bbfa266d6 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -332,7 +332,6 @@ struct printk_message {
unsigned long dropped;
};
-bool other_cpu_in_panic(void);
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
bool is_extended, bool may_supress);
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 646801813415..558ef3177976 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -12,6 +12,7 @@
#include <linux/irqflags.h>
#include <linux/kthread.h>
#include <linux/minmax.h>
+#include <linux/panic.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/slab.h>
@@ -254,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
* opportunity to perform any necessary cleanup if they were
* interrupted by the panic CPU while printing.
*/
- if (other_cpu_in_panic() &&
+ if (panic_on_other_cpu() &&
(!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
}
@@ -309,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
* Event #2 implies the new context is PANIC.
* Event #3 occurs when panic() has flushed the console.
* Event #4 occurs when a non-panic CPU reacquires.
- * Event #5 is not possible due to the other_cpu_in_panic() check
+ * Event #5 is not possible due to the panic_on_other_cpu() check
* in nbcon_context_try_acquire_handover().
*/
@@ -348,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
struct nbcon_state new;
/* Note that the caller must still remove the request! */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return -EPERM;
/*
@@ -446,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
* nbcon_waiter_matches(). In particular, the assumption that
* lower priorities are ignored during panic.
*/
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return -EPERM;
/* Handover is not possible on the same CPU. */
@@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs;
*/
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
- unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
struct nbcon_state cur;
int err;
@@ -614,7 +614,7 @@ out:
/* Acquire succeeded. */
/* Assign the appropriate buffer for this context. */
- if (atomic_read(&panic_cpu) == cpu)
+ if (panic_on_this_cpu())
ctxt->pbufs = &panic_nbcon_pbufs;
else
ctxt->pbufs = con->pbufs;
@@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void)
{
unsigned int *cpu_emergency_nesting;
- if (this_cpu_in_panic())
+ if (panic_on_this_cpu())
return NBCON_PRIO_PANIC;
cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 0efbcdda9aab..5aee9ffb16b9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
+#include <linux/panic.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
-static bool panic_in_progress(void)
-{
- return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
-}
-
-/* Return true if a panic is in progress on the current CPU. */
-bool this_cpu_in_panic(void)
-{
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
-}
-
-/*
- * Return true if a panic is in progress on a remote CPU.
- *
- * On true, the local CPU should immediately release any printing resources
- * that may be needed by the panic CPU.
- */
-bool other_cpu_in_panic(void)
-{
- return (panic_in_progress() && !this_cpu_in_panic());
-}
-
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -2407,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* non-panic CPUs are generating any messages, they will be
* silently dropped.
*/
- if (other_cpu_in_panic() &&
+ if (panic_on_other_cpu() &&
!debug_non_panic_cpus &&
!panic_triggering_all_cpu_backtrace)
return 0;
@@ -2843,7 +2816,7 @@ void console_lock(void)
might_sleep();
/* On panic, the console_lock must be left to the panic cpu. */
- while (other_cpu_in_panic())
+ while (panic_on_other_cpu())
msleep(1000);
down_console_sem();
@@ -2863,7 +2836,7 @@ EXPORT_SYMBOL(console_lock);
int console_trylock(void)
{
/* On panic, the console_lock must be left to the panic cpu. */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return 0;
if (down_trylock_console_sem())
return 0;
@@ -3243,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
any_progress = true;
/* Allow panic_cpu to take over the consoles safely. */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
goto abandon;
if (do_cond_resched)
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index d9fb053cff67..e2a1b2d34d2b 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* But it would have the sequence number returned
* by "prb_next_reserve_seq() - 1".
*/
- if (this_cpu_in_panic() &&
+ if (panic_on_this_cpu() &&
(!debug_non_panic_cpus || legacy_allow_panic_sync) &&
((*seq + 1) < prb_next_reserve_seq(rb))) {
(*seq)++;
diff --git a/kernel/sys.c b/kernel/sys.c
index a46d9b75880b..8b58eece4e58 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
struct rlimit old, new;
struct task_struct *tsk;
unsigned int checkflags = 0;
+ bool need_tasklist;
int ret;
if (old_rlim)
@@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
get_task_struct(tsk);
rcu_read_unlock();
- ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
- old_rlim ? &old : NULL);
+ need_tasklist = !same_thread_group(tsk, current);
+ if (need_tasklist) {
+ /*
+ * Ensure we can't race with group exit or de_thread(),
+ * so tsk->group_leader can't be freed or changed until
+ * read_unlock(tasklist_lock) below.
+ */
+ read_lock(&tasklist_lock);
+ if (!pid_alive(tsk))
+ ret = -ESRCH;
+ }
+
+ if (!ret) {
+ ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+ old_rlim ? &old : NULL);
+ }
+
+ if (need_tasklist)
+ read_unlock(&tasklist_lock);
if (!ret && old_rlim) {
rlim_to_rlim64(&old, &old64);
@@ -2515,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = -EINVAL;
break;
}
+ /*
+ * Ensure that either:
+ *
+ * 1. Subsequent getppid() calls reflect the parent process having died.
+ * 2. forget_original_parent() will send the new me->pdeath_signal.
+ *
+ * Also prevent the read of me->pdeath_signal from being a data race.
+ */
+ read_lock(&tasklist_lock);
me->pdeath_signal = arg2;
+ read_unlock(&tasklist_lock);
break;
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
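
The prlimit64 hunk takes tasklist_lock only when the target task is in another thread group, re-validates the target under the lock, and only then does the update. A userspace sketch of that shape with a pthreads rwlock (the task structure and helpers are illustrative, not kernel API):

```c
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t tasklist_lock = PTHREAD_RWLOCK_INITIALIZER;

struct task { bool alive; int group; };

/* Illustrative stand-in for do_prlimit(). */
static int do_limit_update(struct task *t) { return 0; }

static int update_limits(struct task *target, int my_group)
{
	bool need_tasklist = target->group != my_group;
	int ret = 0;	/* kernel-style: 0 or negative errno */

	if (need_tasklist) {
		pthread_rwlock_rdlock(&tasklist_lock);
		if (!target->alive)	/* target may have exited before we locked */
			ret = -ESRCH;
	}
	if (!ret)
		ret = do_limit_update(target);
	if (need_tasklist)
		pthread_rwlock_unlock(&tasklist_lock);
	return ret;
}
```
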
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b3c94fbaf002..156e7e0bf559 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10211,8 +10211,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
- ret = vfs_parse_fs_string(fc, "source",
- "tracefs", strlen("tracefs"));
+ ret = vfs_parse_fs_string(fc, "source", "tracefs");
if (!ret)
mnt = fc_mount(fc);
else
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..5b62d1002783 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail);
*/
static u16 get_16bit_precision(u64 data_ns)
{
- return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+ /*
+ * One quantum is 2^24 ns ~= 16.8 ms. Adding half a quantum before
+ * shifting rounds to the nearest quantum instead of truncating.
+ */
+ return (data_ns + (1 << 23)) >> 24LL;
}
static void update_cpustat(void)
@@ -444,6 +448,14 @@ static void update_cpustat(void)
old_stat = __this_cpu_read(cpustat_old[i]);
new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+ /*
+ * Because we use 16-bit precision, the integer division above
+ * can lose data, so the computed utilization may occasionally
+ * exceed 100%. To avoid confusing output, cap the displayed
+ * value at 100% whenever the calculation exceeds it.
+ */
+ if (util > 100)
+ util = 100;
__this_cpu_write(cpustat_util[tail][i], util);
__this_cpu_write(cpustat_old[i], new_stat);
}
@@ -455,17 +467,17 @@ static void print_cpustat(void)
{
int i, group;
u8 tail = __this_cpu_read(cpustat_tail);
- u64 sample_period_second = sample_period;
+ u64 sample_period_msecond = sample_period;
- do_div(sample_period_second, NSEC_PER_SEC);
+ do_div(sample_period_msecond, NSEC_PER_MSEC);
/*
* Outputting the "watchdog" prefix on every line is redundant and not
* concise, and the original alarm information is sufficient for
* positioning in logs, hence here printk() is used instead of pr_crit().
*/
- printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
- smp_processor_id(), sample_period_second);
+ printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n",
+ smp_processor_id(), sample_period_msecond);
for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
group = (tail + i) % NUM_SAMPLE_PERIODS;
@@ -740,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (!watchdog_enabled)
return HRTIMER_NORESTART;
+ /*
+ * Do not kick the lockup detectors while a panic is in progress.
+ */
+ if (panic_in_progress())
+ return HRTIMER_NORESTART;
+
watchdog_hardlockup_kick();
/* kick the softlockup detector */
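
The get_16bit_precision() change adds half of the 2^24 ns quantum before shifting, i.e. round-to-nearest instead of truncation. A quick standalone check of the difference:

```c
#include <stdint.h>
#include <stdio.h>

static uint16_t round_16bit(uint64_t ns)    { return (ns + (1 << 23)) >> 24; }
static uint16_t truncate_16bit(uint64_t ns) { return ns >> 24; }

int main(void)
{
	/* 10 ms is closer to one ~16.8 ms quantum than to zero. */
	uint64_t ns = 10ull * 1000 * 1000;

	printf("truncated: %u, rounded: %u\n",
	       truncate_16bit(ns), round_16bit(ns));	/* prints 0 and 1 */
	return 0;
}
```
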
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 9c58f5b4381d..d3ca70e3c256 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "NMI watchdog: " fmt
+#include <linux/panic.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
@@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (panic_in_progress())
+ return;
+
if (!watchdog_check_timestamp())
return;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cab4c7b27e54..3034e294d50d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1067,12 +1067,6 @@ config PANIC_ON_OOPS
Say N if unsure.
-config PANIC_ON_OOPS_VALUE
- int
- range 0 1
- default 0 if !PANIC_ON_OOPS
- default 1 if PANIC_ON_OOPS
-
config PANIC_TIMEOUT
int "panic timeout"
default 0
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 3ef702e6b69a..f26456988445 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -9,6 +9,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
+#include <linux/string_choices.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
@@ -728,7 +729,7 @@ static int __init setup_early_mem_profiling(char *str)
}
mem_profiling_support = true;
pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n",
- compressed ? "with" : "without", enable ? "on" : "off");
+ compressed ? "with" : "without", str_on_off(enable));
}
if (enable != mem_alloc_profiling_enabled()) {
diff --git a/lib/btree.c b/lib/btree.c
index bb81d3393ac5..9c80c0c7bba8 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -653,9 +653,9 @@ int btree_merge(struct btree_head *target, struct btree_head *victim,
* walks to remove a single object from the victim.
*/
for (;;) {
- if (!btree_last(victim, geo, key))
+ val = btree_last(victim, geo, key);
+ if (!val)
break;
- val = btree_lookup(victim, geo, key);
err = btree_insert(target, geo, key, val, gfp);
if (err)
return err;
diff --git a/lib/decompress.c b/lib/decompress.c
index ab3fc90ffc64..7785471586c6 100644
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -49,15 +49,15 @@ struct compress_format {
};
static const struct compress_format compressed_formats[] __initconst = {
- { {0x1f, 0x8b}, "gzip", gunzip },
- { {0x1f, 0x9e}, "gzip", gunzip },
- { {0x42, 0x5a}, "bzip2", bunzip2 },
- { {0x5d, 0x00}, "lzma", unlzma },
- { {0xfd, 0x37}, "xz", unxz },
- { {0x89, 0x4c}, "lzo", unlzo },
- { {0x02, 0x21}, "lz4", unlz4 },
- { {0x28, 0xb5}, "zstd", unzstd },
- { {0, 0}, NULL, NULL }
+ { .magic = {0x1f, 0x8b}, .name = "gzip", .decompressor = gunzip },
+ { .magic = {0x1f, 0x9e}, .name = "gzip", .decompressor = gunzip },
+ { .magic = {0x42, 0x5a}, .name = "bzip2", .decompressor = bunzip2 },
+ { .magic = {0x5d, 0x00}, .name = "lzma", .decompressor = unlzma },
+ { .magic = {0xfd, 0x37}, .name = "xz", .decompressor = unxz },
+ { .magic = {0x89, 0x4c}, .name = "lzo", .decompressor = unlzo },
+ { .magic = {0x02, 0x21}, .name = "lz4", .decompressor = unlz4 },
+ { .magic = {0x28, 0xb5}, .name = "zstd", .decompressor = unzstd },
+ { /* sentinel */ }
};
decompress_fn __init decompress_method(const unsigned char *inbuf, long len,
@@ -73,11 +73,10 @@ decompress_fn __init decompress_method(const unsigned char *inbuf, long len,
pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]);
- for (cf = compressed_formats; cf->name; cf++) {
+ for (cf = compressed_formats; cf->name; cf++)
if (!memcmp(inbuf, cf->magic, 2))
break;
- }
if (name)
*name = cf->name;
return cf->decompressor;
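
The rewritten table relies on designated initializers plus an empty { } sentinel (a GNU C idiom the kernel uses throughout), so the scan loop can terminate on cf->name == NULL without a magic all-zero row. The same shape in isolation:

```c
#include <stdio.h>
#include <string.h>

struct fmt { unsigned char magic[2]; const char *name; };

static const struct fmt formats[] = {
	{ .magic = {0x1f, 0x8b}, .name = "gzip"  },
	{ .magic = {0x42, 0x5a}, .name = "bzip2" },
	{ /* sentinel: .name stays NULL and ends the scan */ }
};

static const char *identify(const unsigned char *buf)
{
	const struct fmt *f;

	for (f = formats; f->name; f++)
		if (!memcmp(buf, f->magic, 2))
			break;
	return f->name;	/* NULL if nothing matched */
}

int main(void)
{
	const unsigned char gz[2] = { 0x1f, 0x8b };

	printf("%s\n", identify(gz));	/* gzip */
	return 0;
}
```
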
diff --git a/lib/digsig.c b/lib/digsig.c
index 04b5e55ed95f..2b36f9cc91e9 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -159,7 +159,6 @@ static int digsig_verify_rsa(struct key *key,
len = mlen;
head = len - l;
- memset(out1, 0, head);
memcpy(out1 + head, p, l);
kfree(p);
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index b3a85fe8b673..f0c78b5b5324 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -102,7 +102,7 @@ static void __dump_stack(const char *log_lvl)
*/
asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
{
- bool in_panic = this_cpu_in_panic();
+ bool in_panic = panic_on_this_cpu();
unsigned long flags;
/*
diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c
index 77558b6c29ca..75403ec50f49 100644
--- a/lib/fault-inject-usercopy.c
+++ b/lib/fault-inject-usercopy.c
@@ -22,10 +22,8 @@ static int __init fail_usercopy_debugfs(void)
dir = fault_create_debugfs_attr("fail_usercopy", NULL,
&fail_usercopy.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
- return 0;
+ return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_usercopy_debugfs);
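
PTR_ERR_OR_ZERO() folds the IS_ERR()/PTR_ERR()/return-0 tail into a single call; its definition in include/linux/err.h is essentially:

```c
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	return 0;
}
```
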
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 4fa5635bf81b..841f29783833 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -899,8 +899,11 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
if (!name)
name = of_node_full_name(np_pool);
}
- if (pdev)
+ if (pdev) {
pool = gen_pool_get(&pdev->dev, name);
+ put_device(&pdev->dev);
+ }
+
of_node_put(np_pool);
return pool;
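
The genalloc fix restores the get/put pairing around the device lookup: of_find_device_by_node() returns a referenced device, and that reference must be dropped once gen_pool_get() has been called. The pairing in isolation:

```c
#include <linux/genalloc.h>
#include <linux/of_platform.h>

static struct gen_pool *pool_from_node(struct device_node *np_pool,
				       const char *name)
{
	struct platform_device *pdev = of_find_device_by_node(np_pool);
	struct gen_pool *pool = NULL;

	if (pdev) {
		pool = gen_pool_get(&pdev->dev, name);
		put_device(&pdev->dev);	/* drop the lookup's reference */
	}
	return pool;
}
```
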
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index cce12287708e..258fb0e7abdf 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -75,7 +75,7 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
struct ref_tracker *tracker;
stats = kmalloc(struct_size(stats, stacks, limit),
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!stats)
return ERR_PTR(-ENOMEM);
stats->total = 0;
@@ -159,7 +159,7 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
return;
}
- sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN);
+ sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT);
for (i = 0, skipped = stats->total; i < stats->count; ++i) {
stack = stats->stacks[i].stack_handle;
@@ -306,7 +306,7 @@ int ref_tracker_free(struct ref_tracker_dir *dir,
}
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
stack_handle = stack_depot_save(entries, nr_entries,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
spin_lock_irqsave(&dir->lock, flags);
if (tracker->dead) {
diff --git a/lib/sys_info.c b/lib/sys_info.c
index 5bf503fd7ec1..496f9151c9b6 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -55,7 +55,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
- char names[sizeof(sys_info_avail) + 1];
+ char names[sizeof(sys_info_avail)];
struct ctl_table table;
unsigned long *si_bits_global;
@@ -81,6 +81,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
char *delim = "";
int i, len = 0;
+ names[0] = '\0';
for (i = 0; i < ARRAY_SIZE(si_names); i++) {
if (*si_bits_global & si_names[i].bit) {
len += scnprintf(names + len, sizeof(names) - len,
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index 211222e63328..be4f93124901 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -26,6 +26,7 @@
#include <linux/kthread.h>
#include <linux/vmalloc.h>
#include <linux/efi_embedded_fw.h>
+#include <linux/string_choices.h>
MODULE_IMPORT_NS("TEST_FIRMWARE");
@@ -304,17 +305,17 @@ static ssize_t config_show(struct device *dev,
"FW_ACTION_NOUEVENT");
len += scnprintf(buf + len, PAGE_SIZE - len,
"into_buf:\t\t%s\n",
- test_fw_config->into_buf ? "true" : "false");
+ str_true_false(test_fw_config->into_buf));
len += scnprintf(buf + len, PAGE_SIZE - len,
"buf_size:\t%zu\n", test_fw_config->buf_size);
len += scnprintf(buf + len, PAGE_SIZE - len,
"file_offset:\t%zu\n", test_fw_config->file_offset);
len += scnprintf(buf + len, PAGE_SIZE - len,
"partial:\t\t%s\n",
- test_fw_config->partial ? "true" : "false");
+ str_true_false(test_fw_config->partial));
len += scnprintf(buf + len, PAGE_SIZE - len,
"sync_direct:\t\t%s\n",
- test_fw_config->sync_direct ? "true" : "false");
+ str_true_false(test_fw_config->sync_direct));
len += scnprintf(buf + len, PAGE_SIZE - len,
"read_fw_idx:\t%u\n", test_fw_config->read_fw_idx);
if (test_fw_config->upload_name)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 128b525b8811..41b6c9386b69 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1031,7 +1031,7 @@ struct backing_dev_info *bdi_alloc(int node_id)
kfree(bdi);
return NULL;
}
- bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
+ bdi->capabilities = BDI_CAP_WRITEBACK;
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = VM_READAHEAD_PAGES;
timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
diff --git a/mm/filemap.c b/mm/filemap.c
index a52dd38d2b4a..13f0259d993c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1621,7 +1621,7 @@ static void filemap_end_dropbehind(struct folio *folio)
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and latter completions should invalidate.
*/
-static void filemap_end_dropbehind_write(struct folio *folio)
+void folio_end_dropbehind(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
@@ -1638,16 +1638,18 @@ static void filemap_end_dropbehind_write(struct folio *folio)
folio_unlock(folio);
}
}
+EXPORT_SYMBOL_GPL(folio_end_dropbehind);
/**
- * folio_end_writeback - End writeback against a folio.
+ * folio_end_writeback_no_dropbehind - End writeback against a folio.
* @folio: The folio.
*
* The folio must actually be under writeback.
+ * This call is intended for filesystems that need to defer dropbehind.
*
* Context: May be called from process or interrupt context.
*/
-void folio_end_writeback(struct folio *folio)
+void folio_end_writeback_no_dropbehind(struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
@@ -1663,6 +1665,25 @@ void folio_end_writeback(struct folio *folio)
folio_rotate_reclaimable(folio);
}
+ if (__folio_end_writeback(folio))
+ folio_wake_bit(folio, PG_writeback);
+
+ acct_reclaim_writeback(folio);
+}
+EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind);
+
+/**
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
+ *
+ * The folio must actually be under writeback.
+ *
+ * Context: May be called from process or interrupt context.
+ */
+void folio_end_writeback(struct folio *folio)
+{
+ VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
+
/*
* Writeback does not hold a folio reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
@@ -1670,11 +1691,8 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (__folio_end_writeback(folio))
- folio_wake_bit(folio, PG_writeback);
-
- filemap_end_dropbehind_write(folio);
- acct_reclaim_writeback(folio);
+ folio_end_writeback_no_dropbehind(folio);
+ folio_end_dropbehind(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
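
With folio_end_writeback() split this way, a filesystem that cannot run the dropbehind invalidation from its completion context can end writeback immediately and defer the dropbehind step. A hypothetical endio path using a work item (only folio_end_writeback_no_dropbehind() and folio_end_dropbehind() come from this patch; the deferral plumbing is made up):

```c
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct dropbehind_work {
	struct work_struct work;
	struct folio *folio;
};

static void dropbehind_workfn(struct work_struct *work)
{
	struct dropbehind_work *dw =
		container_of(work, struct dropbehind_work, work);

	folio_end_dropbehind(dw->folio);
	folio_put(dw->folio);	/* reference taken in the endio path */
	kfree(dw);
}

static void my_fs_end_writeback(struct folio *folio, struct dropbehind_work *dw)
{
	folio_get(folio);	/* keep the folio alive until the work runs */
	folio_end_writeback_no_dropbehind(folio);

	dw->folio = folio;
	INIT_WORK(&dw->work, dropbehind_workfn);
	schedule_work(&dw->work);
}
```
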
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f90fd6a7137..757bc4d3b5b5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2990,26 +2990,23 @@ bool __folio_end_writeback(struct folio *folio)
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
__xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
- wb_stat_mod(wb, WB_WRITEBACK, -nr);
- __wb_writeout_add(wb, nr);
- if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
- wb_inode_writeback_end(wb);
+ wb = inode_to_wb(inode);
+ wb_stat_mod(wb, WB_WRITEBACK, -nr);
+ __wb_writeout_add(wb, nr);
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ wb_inode_writeback_end(wb);
+ if (mapping->host)
+ sb_clear_inode_writeback(mapping->host);
}
- if (mapping->host && !mapping_tagged(mapping,
- PAGECACHE_TAG_WRITEBACK))
- sb_clear_inode_writeback(mapping->host);
-
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
@@ -3034,7 +3031,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, folio->index);
struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
unsigned long flags;
bool on_wblist;
@@ -3045,21 +3042,19 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
-
- wb_stat_mod(wb, WB_WRITEBACK, nr);
- if (!on_wblist)
- wb_inode_writeback_start(wb);
+ wb = inode_to_wb(inode);
+ wb_stat_mod(wb, WB_WRITEBACK, nr);
+ if (!on_wblist) {
+ wb_inode_writeback_start(wb);
+ /*
+ * We can come through here when swapping anonymous
+ * folios, so we don't necessarily have an inode to
+ * track for sync.
+ */
+ if (mapping->host)
+ sb_mark_inode_writeback(mapping->host);
}
- /*
- * We can come through here when swapping anonymous
- * folios, so we don't necessarily have an inode to
- * track for sync.
- */
- if (mapping->host && !on_wblist)
- sb_mark_inode_writeback(mapping->host);
if (!folio_test_dirty(folio))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 2d8b67dac7b5..a570e7adf270 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -101,6 +101,20 @@ config SUNRPC_DEBUG
If unsure, say Y.
+config SUNRPC_DEBUG_TRACE
+ bool "RPC: Send dfprintk() output to the trace buffer"
+ depends on SUNRPC_DEBUG && TRACING
+ default n
+ help
+ dprintk() output can be voluminous, which can overwhelm the
+ kernel's logging facility as it must be sent to the console.
+ This option causes dprintk() output to go to the trace buffer
+ instead of the kernel log.
+
+	  Because this uses trace_printk(), the kernel will log a warning
+	  at boot time, so say N unless you are debugging a problem with
+	  SunRPC-based clients or services.
+
config SUNRPC_XPRT_RDMA
tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index cb32ab9a8395..7d2cdc2bd374 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -794,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
struct gssx_res_accept_sec_context *res = data;
u32 value_follows;
int err;
- struct page *scratch;
+ struct folio *scratch;
- scratch = alloc_page(GFP_KERNEL);
+ scratch = folio_alloc(GFP_KERNEL, 0);
if (!scratch)
return -ENOMEM;
- xdr_set_scratch_page(xdr, scratch);
+ xdr_set_scratch_folio(xdr, scratch);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
@@ -844,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
err = gssx_dec_option_array(xdr, &res->options);
out_free:
- __free_page(scratch);
+ folio_put(scratch);
return err;
}
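
The conversion is one-to-one for an order-0 allocation: folio_alloc(gfp, 0) replaces alloc_page(gfp), and folio_put() replaces __free_page(). A minimal sketch of the pairing:

```c
#include <linux/mm.h>

static int scratch_demo(void)
{
	struct folio *scratch = folio_alloc(GFP_KERNEL, 0);	/* order 0 */

	if (!scratch)
		return -ENOMEM;
	/* ... use folio_address(scratch) as a scratch buffer ... */
	folio_put(scratch);	/* replaces __free_page() */
	return 0;
}
```
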
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9b45fbdc90ca..016f16ca5779 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1074,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task)
rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
return 0;
}
-EXPORT_SYMBOL_GPL(rpc_malloc);
/**
* rpc_free - free RPC buffer resources allocated via rpc_malloc
@@ -1095,7 +1094,6 @@ void rpc_free(struct rpc_task *task)
else
kfree(buf);
}
-EXPORT_SYMBOL_GPL(rpc_free);
/*
* Creation and deletion of RPC task structures
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 4e92e2a50168..d8d8842c7de5 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -86,7 +86,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc)
/* ACL likes to be lazy in allocating pages - ACLs
* are small by default but can get huge. */
if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) {
- *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ *ppage = alloc_page(GFP_NOWAIT);
if (unlikely(*ppage == NULL)) {
if (copied == 0)
return -ENOMEM;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b1fab3a69544..de05ef637bdc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx)
if (m->mode == SVC_POOL_PERNODE)
return m->pool_to[pidx];
}
- return NUMA_NO_NODE;
+ return numa_mem_id();
}
/*
* Set the given thread's cpus_allowed mask so that it
@@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net)
svc_unregister(serv, net);
rpcb_put_local(net);
}
-EXPORT_SYMBOL_GPL(svc_rpcb_cleanup);
static int svc_uses_rpcbind(struct svc_serv *serv)
{
@@ -670,8 +669,8 @@ svc_rqst_free(struct svc_rqst *rqstp)
folio_batch_release(&rqstp->rq_fbatch);
kfree(rqstp->rq_bvec);
svc_release_buffer(rqstp);
- if (rqstp->rq_scratch_page)
- put_page(rqstp->rq_scratch_page);
+ if (rqstp->rq_scratch_folio)
+ folio_put(rqstp->rq_scratch_folio);
kfree(rqstp->rq_resp);
kfree(rqstp->rq_argp);
kfree(rqstp->rq_auth_data);
@@ -692,8 +691,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
- rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0);
- if (!rqstp->rq_scratch_page)
+ rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node);
+ if (!rqstp->rq_scratch_folio)
goto out_enomem;
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 8b1837228799..049ab53088e9 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1102,6 +1102,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
* svc_xprt_destroy_all - Destroy transports associated with @serv
* @serv: RPC service to be shut down
* @net: target network namespace
+ * @unregister: true if it is OK to unregister the destroyed xprts
*
* Server threads may still be running (especially in the case where the
* service is still running in other network namespaces).
@@ -1114,7 +1115,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
* threads, we may need to wait a little while and then check again to
* see if they're done.
*/
-void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net)
+void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net,
+ bool unregister)
{
int delay = 0;
@@ -1124,6 +1126,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net)
svc_clean_up_xprts(serv, net);
msleep(delay++);
}
+
+ if (unregister)
+ svc_rpcb_cleanup(serv, net);
}
EXPORT_SYMBOL_GPL(svc_xprt_destroy_all);
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 1478c41c7e9d..3aac1456e23e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
while (len > 0) {
if (!*ppages)
- *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ *ppages = alloc_page(GFP_NOWAIT);
if (!*ppages)
return -ENOBUFS;
ppages++;
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index e722dd6fa8ef..92669904eecc 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2636,6 +2636,11 @@ sub exclude_global_initialisers {
$realfile =~ m@/bpf/.*\.bpf\.c$@;
}
+sub is_userspace {
+ my ($realfile) = @_;
+ return ($realfile =~ m@^tools/@ || $realfile =~ m@^scripts/@);
+}
+
sub process {
my $filename = shift;
@@ -3294,7 +3299,7 @@ sub process {
# file delta changes
$line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ ||
# filename then :
- $line =~ /^\s*(?:Fixes:|$link_tags_search|$signature_tags)/i ||
+ $line =~ /^\s*(?:Fixes:|https?:|$link_tags_search|$signature_tags)/i ||
# A Fixes:, link or signature tag line
$commit_log_possible_stack_dump)) {
WARN("COMMIT_LOG_LONG_LINE",
@@ -7018,21 +7023,20 @@ sub process {
# }
# }
# }
-
# strcpy uses that should likely be strscpy
- if ($line =~ /\bstrcpy\s*\(/) {
+ if ($line =~ /\bstrcpy\s*\(/ && !is_userspace($realfile)) {
WARN("STRCPY",
"Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88\n" . $herecurr);
}
# strlcpy uses that should likely be strscpy
- if ($line =~ /\bstrlcpy\s*\(/) {
+ if ($line =~ /\bstrlcpy\s*\(/ && !is_userspace($realfile)) {
WARN("STRLCPY",
"Prefer strscpy over strlcpy - see: https://github.com/KSPP/linux/issues/89\n" . $herecurr);
}
# strncpy uses that should likely be strscpy or strscpy_pad
- if ($line =~ /\bstrncpy\s*\(/) {
+ if ($line =~ /\bstrncpy\s*\(/ && !is_userspace($realfile)) {
WARN("STRNCPY",
"Prefer strscpy, strscpy_pad, or __nonstring over strncpy - see: https://github.com/KSPP/linux/issues/90\n" . $herecurr);
}
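
What the three warnings steer kernel code toward: strscpy() always NUL-terminates and reports truncation, unlike strncpy(). A short kernel-style sketch:

```c
#include <linux/errno.h>
#include <linux/string.h>

static int copy_name(char *dst, size_t dst_size, const char *src)
{
	ssize_t n = strscpy(dst, src, dst_size);

	if (n < 0)	/* -E2BIG: src did not fit; dst is still terminated */
		return -ENAMETOOLONG;
	return 0;	/* n characters copied, excluding the NUL */
}
```
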
diff --git a/scripts/coccinelle/api/platform_no_drv_owner.cocci b/scripts/coccinelle/api/platform_no_drv_owner.cocci
index 8fa050eeb7e5..5e869858bda8 100644
--- a/scripts/coccinelle/api/platform_no_drv_owner.cocci
+++ b/scripts/coccinelle/api/platform_no_drv_owner.cocci
@@ -10,12 +10,21 @@ virtual org
virtual report
@match1@
+declarer name builtin_i2c_driver;
+declarer name builtin_platform_driver;
+declarer name builtin_platform_driver_probe;
declarer name module_i2c_driver;
declarer name module_platform_driver;
declarer name module_platform_driver_probe;
identifier __driver;
@@
(
+ builtin_i2c_driver(__driver);
+|
+ builtin_platform_driver(__driver);
+|
+ builtin_platform_driver_probe(__driver, ...);
+|
module_i2c_driver(__driver);
|
module_platform_driver(__driver);
diff --git a/scripts/coccinelle/misc/of_table.cocci b/scripts/coccinelle/misc/of_table.cocci
index 4693ea744753..17881cb0884b 100644
--- a/scripts/coccinelle/misc/of_table.cocci
+++ b/scripts/coccinelle/misc/of_table.cocci
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/// Make sure (of/i2c/platform)_device_id tables are NULL terminated
+/// Make sure (of/i2c/platform/spi)_device_id tables are NULL terminated
//
// Keywords: of_table i2c_table platform_table
// Confidence: Medium
@@ -15,14 +15,14 @@ identifier var, arr;
expression E;
@@
(
-struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = {
...,
{
.var = E,
* }
};
|
-struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = {
...,
* { ..., E, ... },
};
@@ -33,7 +33,7 @@ identifier var, arr;
expression E;
@@
(
-struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = {
...,
{
.var = E,
@@ -42,7 +42,7 @@ struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+ { }
};
|
-struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = {
...,
{ ..., E, ... },
+ { },
@@ -55,7 +55,7 @@ identifier var, arr;
expression E;
@@
(
-struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = {
...,
{
.var = E,
@@ -63,7 +63,7 @@ struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
@p1
};
|
-struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = {
+struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = {
...,
{ ..., E, ... }
@p1
diff --git a/security/security.c b/security/security.c
index 301104d63fde..4d3c03a4524c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1814,7 +1814,7 @@ EXPORT_SYMBOL(security_dentry_init_security);
* Return: Returns 0 on success, error on failure.
*/
int security_dentry_create_files_as(struct dentry *dentry, int mode,
- struct qstr *name,
+ const struct qstr *name,
const struct cred *old, struct cred *new)
{
return call_int_hook(dentry_create_files_as, dentry, mode,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 76b66845a1c3..dfc22da42f30 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2905,7 +2905,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode,
}
static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
- struct qstr *name,
+ const struct qstr *name,
const struct cred *old,
struct cred *new)
{
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index fdf2f193a291..af986587841d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4908,7 +4908,7 @@ static int smack_inode_copy_up_xattr(struct dentry *src, const char *name)
}
static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
- struct qstr *name,
+ const struct qstr *name,
const struct cred *old,
struct cred *new)
{
diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c
index 9afb1ffc00ba..72cc500b44b1 100644
--- a/tools/accounting/delaytop.c
+++ b/tools/accounting/delaytop.c
@@ -42,14 +42,13 @@
#include <linux/genetlink.h>
#include <linux/taskstats.h>
#include <linux/cgroupstats.h>
+#include <stddef.h>
-#define PSI_CPU_SOME "/proc/pressure/cpu"
-#define PSI_CPU_FULL "/proc/pressure/cpu"
-#define PSI_MEMORY_SOME "/proc/pressure/memory"
-#define PSI_MEMORY_FULL "/proc/pressure/memory"
-#define PSI_IO_SOME "/proc/pressure/io"
-#define PSI_IO_FULL "/proc/pressure/io"
-#define PSI_IRQ_FULL "/proc/pressure/irq"
+#define PSI_PATH "/proc/pressure"
+#define PSI_CPU_PATH "/proc/pressure/cpu"
+#define PSI_MEMORY_PATH "/proc/pressure/memory"
+#define PSI_IO_PATH "/proc/pressure/io"
+#define PSI_IRQ_PATH "/proc/pressure/irq"
#define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN))
@@ -61,24 +60,28 @@
#define TASK_COMM_LEN 16
#define MAX_MSG_SIZE 1024
#define MAX_TASKS 1000
+#define MAX_BUF_LEN 256
#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
#define BOOL_FPRINT(stream, fmt, ...) \
({ \
int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
ret >= 0; \
})
+#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count)
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
-
-/* Program settings structure */
-struct config {
- int delay; /* Update interval in seconds */
- int iterations; /* Number of iterations, 0 == infinite */
- int max_processes; /* Maximum number of processes to show */
- char sort_field; /* Field to sort by */
- int output_one_time; /* Output once and exit */
- int monitor_pid; /* Monitor specific PID */
- char *container_path; /* Path to container cgroup */
-};
+#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n"
+#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n"
+#define SORT_FIELD(name, cmd, modes) \
+ {#name, #cmd, \
+ offsetof(struct task_info, name##_delay_total), \
+ offsetof(struct task_info, name##_count), \
+ modes}
+#define END_FIELD {NULL, 0, 0}
+
+/* Display mode types */
+#define MODE_TYPE_ALL (0xFFFFFFFF)
+#define MODE_DEFAULT (1 << 0)
+#define MODE_MEMVERBOSE (1 << 1)
/* PSI statistics structure */
struct psi_stats {
@@ -119,6 +122,8 @@ struct task_info {
unsigned long long wpcopy_delay_total;
unsigned long long irq_count;
unsigned long long irq_delay_total;
+ unsigned long long mem_count;
+ unsigned long long mem_delay_total;
};
/* Container statistics structure */
@@ -130,6 +135,27 @@ struct container_stats {
int nr_io_wait; /* Number of processes in IO wait */
};
+/* Delay field structure */
+struct field_desc {
+ const char *name; /* Field name for cmdline argument */
+ const char *cmd_char; /* Interactive command */
+ unsigned long total_offset; /* Offset of total delay in task_info */
+ unsigned long count_offset; /* Offset of count in task_info */
+ size_t supported_modes; /* Supported display modes */
+};
+
+/* Program settings structure */
+struct config {
+ int delay; /* Update interval in seconds */
+ int iterations; /* Number of iterations, 0 == infinite */
+ int max_processes; /* Maximum number of processes to show */
+ int output_one_time; /* Output once and exit */
+ int monitor_pid; /* Monitor specific PID */
+ char *container_path; /* Path to container cgroup */
+ const struct field_desc *sort_field; /* Current sort field */
+ size_t display_mode; /* Current display mode */
+};
+
/* Global variables */
static struct config cfg;
static struct psi_stats psi;
@@ -137,6 +163,19 @@ static struct task_info tasks[MAX_TASKS];
static int task_count;
static int running = 1;
static struct container_stats container_stats;
+static const struct field_desc sort_fields[] = {
+ SORT_FIELD(cpu, c, MODE_DEFAULT),
+ SORT_FIELD(blkio, i, MODE_DEFAULT),
+ SORT_FIELD(irq, q, MODE_DEFAULT),
+ SORT_FIELD(mem, m, MODE_DEFAULT | MODE_MEMVERBOSE),
+ SORT_FIELD(swapin, s, MODE_MEMVERBOSE),
+ SORT_FIELD(freepages, r, MODE_MEMVERBOSE),
+ SORT_FIELD(thrashing, t, MODE_MEMVERBOSE),
+ SORT_FIELD(compact, p, MODE_MEMVERBOSE),
+ SORT_FIELD(wpcopy, w, MODE_MEMVERBOSE),
+ END_FIELD
+};
+static int sort_selected;
/* Netlink socket variables */
static int nl_sd = -1;
@@ -158,18 +197,75 @@ static void disable_raw_mode(void)
tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
}
+/* Find field descriptor by interactive command character */
+static const struct field_desc *get_field_by_cmd_char(char ch)
+{
+ const struct field_desc *field;
+
+ for (field = sort_fields; field->name != NULL; field++) {
+ if (field->cmd_char[0] == ch)
+ return field;
+ }
+
+ return NULL;
+}
+
+/* Find field descriptor by field name */
+static const struct field_desc *get_field_by_name(const char *name)
+{
+ const struct field_desc *field;
+ size_t field_len;
+
+ for (field = sort_fields; field->name != NULL; field++) {
+ field_len = strlen(field->name);
+ if (field_len != strlen(name))
+ continue;
+ if (strncmp(field->name, name, field_len) == 0)
+ return field;
+ }
+
+ return NULL;
+}
+
+/* Find display name for a field descriptor */
+static const char *get_name_by_field(const struct field_desc *field)
+{
+ return field ? field->name : "UNKNOWN";
+}
+
+/* Print the available field names for the given display mode */
+static void display_available_fields(size_t mode)
+{
+ const struct field_desc *field;
+ char buf[MAX_BUF_LEN];
+
+ buf[0] = '\0';
+
+ for (field = sort_fields; field->name != NULL; field++) {
+ if (!(field->supported_modes & mode))
+ continue;
+ if (buf[0] != '\0') /* no leading '|' before the first name */
+ strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1);
+ strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1);
+ }
+
+ fprintf(stderr, "Available fields: %s\n", buf);
+}
+
/* Display usage information and command line options */
static void usage(void)
{
printf("Usage: delaytop [Options]\n"
"Options:\n"
- " -h, --help Show this help message and exit\n"
- " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n"
- " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n"
- " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n"
- " -o, --once Display once and exit\n"
- " -p, --pid=PID Monitor only the specified PID\n"
- " -C, --container=PATH Monitor the container at specified cgroup path\n");
+ " -h, --help Show this help message and exit\n"
+ " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n"
+ " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n"
+ " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n"
+ " -o, --once Display once and exit\n"
+ " -p, --pid=PID Monitor only the specified PID\n"
+ " -C, --container=PATH Monitor the container at specified cgroup path\n"
+ " -s, --sort=FIELD Sort by delay field (default: cpu)\n"
+ " -M, --memverbose Display memory detailed information\n");
exit(0);
}
@@ -177,6 +273,7 @@ static void usage(void)
static void parse_args(int argc, char **argv)
{
int c;
+ const struct field_desc *field;
struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"delay", required_argument, 0, 'd'},
@@ -184,7 +281,9 @@ static void parse_args(int argc, char **argv)
{"pid", required_argument, 0, 'p'},
{"once", no_argument, 0, 'o'},
{"processes", required_argument, 0, 'P'},
+ {"sort", required_argument, 0, 's'},
{"container", required_argument, 0, 'C'},
+ {"memverbose", no_argument, 0, 'M'},
{0, 0, 0, 0}
};
@@ -192,15 +291,16 @@ static void parse_args(int argc, char **argv)
cfg.delay = 2;
cfg.iterations = 0;
cfg.max_processes = 20;
- cfg.sort_field = 'c'; /* Default sort by CPU delay */
+ cfg.sort_field = &sort_fields[0]; /* Default sorted by CPU delay */
cfg.output_one_time = 0;
cfg.monitor_pid = 0; /* 0 means monitor all PIDs */
cfg.container_path = NULL;
+ cfg.display_mode = MODE_DEFAULT;
while (1) {
int option_index = 0;
- c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index);
+ c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index);
if (c == -1)
break;
@@ -247,6 +347,26 @@ static void parse_args(int argc, char **argv)
case 'C':
cfg.container_path = strdup(optarg);
break;
+ case 's':
+ if (strlen(optarg) == 0) {
+ fprintf(stderr, "Error: empty sort field\n");
+ exit(1);
+ }
+
+ field = get_field_by_name(optarg);
+ /* Show available fields if invalid option provided */
+ if (!field) {
+ fprintf(stderr, "Error: invalid sort field '%s'\n", optarg);
+ display_available_fields(MODE_TYPE_ALL);
+ exit(1);
+ }
+
+ cfg.sort_field = field;
+ break;
+ case 'M':
+ cfg.display_mode = MODE_MEMVERBOSE;
+ cfg.sort_field = get_field_by_name("mem");
+ break;
default:
fprintf(stderr, "Try 'delaytop --help' for more information.\n");
exit(1);
@@ -254,6 +374,25 @@ static void parse_args(int argc, char **argv)
}
}
+/* Aggregate the memory-related delay totals into mem_delay_total */
+static void set_mem_delay_total(struct task_info *t)
+{
+ t->mem_delay_total = t->swapin_delay_total +
+ t->freepages_delay_total +
+ t->thrashing_delay_total +
+ t->compact_delay_total +
+ t->wpcopy_delay_total;
+}
+
+static void set_mem_count(struct task_info *t)
+{
+ t->mem_count = t->swapin_count +
+ t->freepages_count +
+ t->thrashing_count +
+ t->compact_count +
+ t->wpcopy_count;
+}
+
/* Create a raw netlink socket and bind */
static int create_nl_socket(void)
{
@@ -358,87 +497,134 @@ static int get_family_id(int sd)
return id;
}
-static void read_psi_stats(void)
+static int read_psi_stats(void)
{
FILE *fp;
char line[256];
int ret = 0;
+ int error_count = 0;
+
+ /* Check if PSI path exists */
+ if (access(PSI_PATH, F_OK) != 0) {
+ fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH);
+ fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n");
+ return -1;
+ }
+
/* Zero all fields */
memset(&psi, 0, sizeof(psi));
+
/* CPU pressure */
- fp = fopen(PSI_CPU_SOME, "r");
+ fp = fopen(PSI_CPU_PATH, "r");
if (fp) {
while (fgets(line, sizeof(line), fp)) {
if (strncmp(line, "some", 4) == 0) {
ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.cpu_some_avg10, &psi.cpu_some_avg60,
&psi.cpu_some_avg300, &psi.cpu_some_total);
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse CPU some PSI data\n");
+ error_count++;
+ }
} else if (strncmp(line, "full", 4) == 0) {
ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.cpu_full_avg10, &psi.cpu_full_avg60,
&psi.cpu_full_avg300, &psi.cpu_full_total);
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse CPU full PSI data\n");
+ error_count++;
+ }
}
}
fclose(fp);
+ } else {
+ fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH);
+ error_count++;
}
+
/* Memory pressure */
- fp = fopen(PSI_MEMORY_SOME, "r");
+ fp = fopen(PSI_MEMORY_PATH, "r");
if (fp) {
while (fgets(line, sizeof(line), fp)) {
if (strncmp(line, "some", 4) == 0) {
ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.memory_some_avg10, &psi.memory_some_avg60,
&psi.memory_some_avg300, &psi.memory_some_total);
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse Memory some PSI data\n");
+ error_count++;
+ }
} else if (strncmp(line, "full", 4) == 0) {
ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.memory_full_avg10, &psi.memory_full_avg60,
&psi.memory_full_avg300, &psi.memory_full_total);
- }
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse Memory full PSI data\n");
+ error_count++;
+ }
+ }
}
fclose(fp);
+ } else {
+ fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH);
+ error_count++;
}
+
/* IO pressure */
- fp = fopen(PSI_IO_SOME, "r");
+ fp = fopen(PSI_IO_PATH, "r");
if (fp) {
while (fgets(line, sizeof(line), fp)) {
if (strncmp(line, "some", 4) == 0) {
ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.io_some_avg10, &psi.io_some_avg60,
&psi.io_some_avg300, &psi.io_some_total);
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse IO some PSI data\n");
+ error_count++;
+ }
} else if (strncmp(line, "full", 4) == 0) {
ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.io_full_avg10, &psi.io_full_avg60,
&psi.io_full_avg300, &psi.io_full_total);
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse IO full PSI data\n");
+ error_count++;
+ }
}
}
fclose(fp);
+ } else {
+ fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH);
+ error_count++;
}
+
/* IRQ pressure (only full) */
- fp = fopen(PSI_IRQ_FULL, "r");
+ fp = fopen(PSI_IRQ_PATH, "r");
if (fp) {
while (fgets(line, sizeof(line), fp)) {
if (strncmp(line, "full", 4) == 0) {
ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
&psi.irq_full_avg10, &psi.irq_full_avg60,
&psi.irq_full_avg300, &psi.irq_full_total);
- if (ret != 4)
+ if (ret != 4) {
fprintf(stderr, "Failed to parse IRQ full PSI data\n");
+ error_count++;
+ }
}
}
fclose(fp);
+ } else {
+ fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH);
+ error_count++;
+ }
+
+ /* Return error count: 0 means success, >0 means warnings, -1 means fatal error */
+ if (error_count > 0) {
+ fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count);
+ return error_count;
}
+
+ return 0;
}
static int read_comm(int pid, char *comm_buf, size_t buf_size)
@@ -527,6 +713,8 @@ static void fetch_and_fill_task_info(int pid, const char *comm)
SET_TASK_STAT(task_count, wpcopy_delay_total);
SET_TASK_STAT(task_count, irq_count);
SET_TASK_STAT(task_count, irq_delay_total);
+ set_mem_count(&tasks[task_count]);
+ set_mem_delay_total(&tasks[task_count]);
task_count++;
}
break;
@@ -587,19 +775,23 @@ static int compare_tasks(const void *a, const void *b)
{
const struct task_info *t1 = (const struct task_info *)a;
const struct task_info *t2 = (const struct task_info *)b;
+ unsigned long long total1;
+ unsigned long long total2;
+ unsigned long count1;
+ unsigned long count2;
double avg1, avg2;
- switch (cfg.sort_field) {
- case 'c': /* CPU */
- avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count);
- avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count);
- if (avg1 != avg2)
- return avg2 > avg1 ? 1 : -1;
- return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
+ total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset);
+ total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset);
+ count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset);
+ count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset);
- default:
- return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
- }
+ avg1 = average_ms(total1, count1);
+ avg2 = average_ms(total2, count2);
+ if (avg1 != avg2)
+ return avg2 > avg1 ? 1 : -1;
+
+ return 0;
}
/* Sort tasks by selected field */
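
compare_tasks() now reads the selected sort key generically through the offsets recorded in struct field_desc, dereferencing each offset with a pointer of the member's actual type. The offsetof() trick in isolation, as a standalone userspace check:

```c
#include <stddef.h>
#include <stdio.h>

struct rec { unsigned long long a_total, a_count, b_total, b_count; };

/* Read an unsigned long long member via its offsetof()-derived offset,
 * through a pointer of the member's actual type. */
static unsigned long long read_ull(const struct rec *r, size_t off)
{
	return *(const unsigned long long *)((const char *)r + off);
}

int main(void)
{
	struct rec r = { .b_total = 42 };

	printf("%llu\n", read_ull(&r, offsetof(struct rec, b_total)));	/* 42 */
	return 0;
}
```
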
@@ -673,7 +865,7 @@ static void get_container_stats(void)
}
/* Display results to stdout or log file */
-static void display_results(void)
+static void display_results(int psi_ret)
{
time_t now = time(NULL);
struct tm *tm_now = localtime(&now);
@@ -686,49 +878,53 @@ static void display_results(void)
suc &= BOOL_FPRINT(out, "\033[H\033[J");
/* PSI output (one-line, no cat style) */
- suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "CPU some:",
- psi.cpu_some_avg10,
- psi.cpu_some_avg60,
- psi.cpu_some_avg300,
- psi.cpu_some_total / 1000);
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "CPU full:",
- psi.cpu_full_avg10,
- psi.cpu_full_avg60,
- psi.cpu_full_avg300,
- psi.cpu_full_total / 1000);
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "Memory full:",
- psi.memory_full_avg10,
- psi.memory_full_avg60,
- psi.memory_full_avg300,
- psi.memory_full_total / 1000);
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "Memory some:",
- psi.memory_some_avg10,
- psi.memory_some_avg60,
- psi.memory_some_avg300,
- psi.memory_some_total / 1000);
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "IO full:",
- psi.io_full_avg10,
- psi.io_full_avg60,
- psi.io_full_avg300,
- psi.io_full_total / 1000);
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "IO some:",
- psi.io_some_avg10,
- psi.io_some_avg60,
- psi.io_some_avg300,
- psi.io_some_total / 1000);
- suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
- "IRQ full:",
- psi.irq_full_avg10,
- psi.irq_full_avg60,
- psi.irq_full_avg300,
- psi.irq_full_total / 1000);
+ suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60vg300/total)\n");
+ if (psi_ret) {
+ suc &= BOOL_FPRINT(out, " PSI not found: check if psi=1 enabled in cmdline\n");
+ } else {
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "CPU some:",
+ psi.cpu_some_avg10,
+ psi.cpu_some_avg60,
+ psi.cpu_some_avg300,
+ psi.cpu_some_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "CPU full:",
+ psi.cpu_full_avg10,
+ psi.cpu_full_avg60,
+ psi.cpu_full_avg300,
+ psi.cpu_full_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "Memory full:",
+ psi.memory_full_avg10,
+ psi.memory_full_avg60,
+ psi.memory_full_avg300,
+ psi.memory_full_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "Memory some:",
+ psi.memory_some_avg10,
+ psi.memory_some_avg60,
+ psi.memory_some_avg300,
+ psi.memory_some_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "IO full:",
+ psi.io_full_avg10,
+ psi.io_full_avg60,
+ psi.io_full_avg300,
+ psi.io_full_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "IO some:",
+ psi.io_some_avg10,
+ psi.io_some_avg60,
+ psi.io_some_avg300,
+ psi.io_some_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "IRQ full:",
+ psi.irq_full_avg10,
+ psi.irq_full_avg60,
+ psi.irq_full_avg300,
+ psi.irq_full_total / 1000);
+ }
if (cfg.container_path) {
suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
@@ -738,29 +934,59 @@ static void display_results(void)
container_stats.nr_stopped, container_stats.nr_uninterruptible,
container_stats.nr_io_wait);
}
- suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n",
- cfg.max_processes);
- suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND");
- suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n",
- "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)",
- "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)");
- suc &= BOOL_FPRINT(out, "-----------------------------------------------");
- suc &= BOOL_FPRINT(out, "----------------------------------------------\n");
+ /* Interactive commands */
+ suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n");
+ if (sort_selected) {
+ if (cfg.display_mode == MODE_MEMVERBOSE)
+ suc &= BOOL_FPRINT(out,
+ "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n");
+ else
+ suc &= BOOL_FPRINT(out,
+ "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n");
+ }
+
+ /* Task delay output */
+ suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n",
+ cfg.max_processes, get_name_by_field(cfg.sort_field));
+
+ suc &= BOOL_FPRINT(out, "%8s %8s %-17s", "PID", "TGID", "COMMAND");
+ if (cfg.display_mode == MODE_MEMVERBOSE) {
+ suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n",
+ "MEM(ms)", "SWAP(ms)", "RCL(ms)",
+ "THR(ms)", "CMP(ms)", "WP(ms)");
+ suc &= BOOL_FPRINT(out, "-----------------------");
+ suc &= BOOL_FPRINT(out, "-----------------------");
+ suc &= BOOL_FPRINT(out, "-----------------------");
+ suc &= BOOL_FPRINT(out, "---------------------\n");
+ } else {
+ suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n",
+ "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)");
+ suc &= BOOL_FPRINT(out, "-----------------------");
+ suc &= BOOL_FPRINT(out, "-----------------------");
+ suc &= BOOL_FPRINT(out, "--------------------------\n");
+ }
+
count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
for (i = 0; i < count; i++) {
- suc &= BOOL_FPRINT(out, "%5d %5d %-15s",
+ suc &= BOOL_FPRINT(out, "%8d %8d %-15s",
tasks[i].pid, tasks[i].tgid, tasks[i].command);
- suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n",
- average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count),
- average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count),
- average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count),
- average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count),
- average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count),
- average_ms(tasks[i].compact_delay_total, tasks[i].compact_count),
- average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count),
- average_ms(tasks[i].irq_delay_total, tasks[i].irq_count));
+ if (cfg.display_mode == MODE_MEMVERBOSE) {
+ suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE,
+ TASK_AVG(tasks[i], mem),
+ TASK_AVG(tasks[i], swapin),
+ TASK_AVG(tasks[i], freepages),
+ TASK_AVG(tasks[i], thrashing),
+ TASK_AVG(tasks[i], compact),
+ TASK_AVG(tasks[i], wpcopy));
+ } else {
+ suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT,
+ TASK_AVG(tasks[i], cpu),
+ TASK_AVG(tasks[i], blkio),
+ TASK_AVG(tasks[i], irq),
+ TASK_AVG(tasks[i], mem));
+ }
}
suc &= BOOL_FPRINT(out, "\n");
@@ -769,11 +995,79 @@ static void display_results(void)
perror("Error writing to output");
}
+/* Check for keyboard input with timeout based on cfg.delay */
+static char check_for_keypress(void)
+{
+ struct timeval tv = {cfg.delay, 0};
+ fd_set readfds;
+ char ch = 0;
+
+ FD_ZERO(&readfds);
+ FD_SET(STDIN_FILENO, &readfds);
+ int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);
+
+ if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
+ read(STDIN_FILENO, &ch, 1);
+ return ch;
+ }
+
+ return 0;
+}
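+
A design note on the helper above: select() doubles as the inter-sample sleep, so a single call both paces the loop at cfg.delay seconds and wakes early on a keystroke. That is what allows the old sleep(cfg.delay)-or-poll split to be deleted from main() further down.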
+
+#define MAX_MODE_SIZE 2
+static void toggle_display_mode(void)
+{
+ static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE};
+ static size_t cur_index;
+
+ cur_index = (cur_index + 1) % MAX_MODE_SIZE;
+ cfg.display_mode = modes[cur_index];
+}
+
+/* Handle keyboard input: sorting selection, mode toggle, or quit */
+static void handle_keypress(char ch, int *running)
+{
+ const struct field_desc *field;
+
+ /* Change sort field */
+ if (sort_selected) {
+ field = get_field_by_cmd_char(ch);
+ if (field && (field->supported_modes & cfg.display_mode))
+ cfg.sort_field = field;
+
+ sort_selected = 0;
+ /* Handle mode changes or quit */
+ } else {
+ switch (ch) {
+ case 'o':
+ sort_selected = 1;
+ break;
+ case 'M':
+ toggle_display_mode();
+ for (field = sort_fields; field->name != NULL; field++) {
+ if (field->supported_modes & cfg.display_mode) {
+ cfg.sort_field = field;
+ break;
+ }
+ }
+ break;
+ case 'q':
+ case 'Q':
+ *running = 0;
+ break;
+ default:
+ break;
+ }
+ }
+}
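
Both handle_keypress() above and the auto-switch loop in main() below iterate a NULL-terminated sort_fields table defined elsewhere in this patch. A hedged sketch of the shape those loops assume; the flag values, member names, and entries are guesses modeled on the visible uses, not the patch's actual table:

#include <stddef.h>

/* Assumed: display modes are bit flags, so supported_modes can be a mask. */
enum { MODE_DEFAULT = 1 << 0, MODE_MEMVERBOSE = 1 << 1 };

struct task_info_sketch {
	unsigned long long cpu_delay_total, mem_delay_total;
	unsigned long cpu_count, mem_count;
};

struct field_desc {
	const char *name;		/* shown by get_name_by_field() */
	char cmd;			/* key matched by get_field_by_cmd_char() */
	unsigned int supported_modes;	/* tested against cfg.display_mode */
	size_t total_offset;
	size_t count_offset;
};

static const struct field_desc sort_fields_sketch[] = {
	{ "CPU", 'c', MODE_DEFAULT,
	  offsetof(struct task_info_sketch, cpu_delay_total),
	  offsetof(struct task_info_sketch, cpu_count) },
	{ "MEM", 'm', MODE_DEFAULT | MODE_MEMVERBOSE,
	  offsetof(struct task_info_sketch, mem_delay_total),
	  offsetof(struct task_info_sketch, mem_count) },
	{ NULL, 0, 0, 0, 0 },	/* terminator: loops stop at name == NULL */
};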
+
/* Main function */
int main(int argc, char **argv)
{
+ const struct field_desc *field;
int iterations = 0;
- int use_q_quit = 0;
+ int psi_ret = 0;
+ char keypress;
/* Parse command line arguments */
parse_args(argc, argv);
@@ -793,17 +1087,24 @@ int main(int argc, char **argv)
exit(1);
}
- if (!cfg.output_one_time) {
- use_q_quit = 1;
- enable_raw_mode();
- printf("Press 'q' to quit.\n");
- fflush(stdout);
- }
+ /* Set terminal to non-canonical mode for interaction */
+ enable_raw_mode();
/* Main loop */
while (running) {
+ /* Auto-switch sort field when not matching display mode */
+ if (!(cfg.sort_field->supported_modes & cfg.display_mode)) {
+ for (field = sort_fields; field->name != NULL; field++) {
+ if (field->supported_modes & cfg.display_mode) {
+ cfg.sort_field = field;
+ printf("Auto-switched sort field to: %s\n", field->name);
+ break;
+ }
+ }
+ }
+
/* Read PSI statistics */
- read_psi_stats();
+ psi_ret = read_psi_stats();
/* Get container stats if container path provided */
if (cfg.container_path)
@@ -816,7 +1117,7 @@ int main(int argc, char **argv)
sort_tasks();
/* Display results to stdout or log file */
- display_results();
+ display_results(psi_ret);
/* Check for iterations */
if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
@@ -826,32 +1127,14 @@ int main(int argc, char **argv)
if (cfg.output_one_time)
break;
- /* Check for 'q' key to quit */
- if (use_q_quit) {
- struct timeval tv = {cfg.delay, 0};
- fd_set readfds;
-
- FD_ZERO(&readfds);
- FD_SET(STDIN_FILENO, &readfds);
- int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv);
-
- if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
- char ch = 0;
-
- read(STDIN_FILENO, &ch, 1);
- if (ch == 'q' || ch == 'Q') {
- running = 0;
- break;
- }
- }
- } else {
- sleep(cfg.delay);
- }
+ /* Keypress for interactive usage */
+ keypress = check_for_keypress();
+ if (keypress)
+ handle_keypress(keypress, &running);
}
/* Restore terminal mode */
- if (use_q_quit)
- disable_raw_mode();
+ disable_raw_mode();
/* Cleanup */
close(nl_sd);
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
index 84b8c3c92c79..2f830ff8396c 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -499,19 +499,17 @@ void ida_check_random(void)
goto repeat;
}
-void ida_simple_get_remove_test(void)
+void ida_alloc_free_test(void)
{
DEFINE_IDA(ida);
unsigned long i;
- for (i = 0; i < 10000; i++) {
- assert(ida_simple_get(&ida, 0, 20000, GFP_KERNEL) == i);
- }
- assert(ida_simple_get(&ida, 5, 30, GFP_KERNEL) < 0);
+ for (i = 0; i < 10000; i++)
+ assert(ida_alloc_max(&ida, 20000, GFP_KERNEL) == i);
+ assert(ida_alloc_range(&ida, 5, 30, GFP_KERNEL) < 0);
- for (i = 0; i < 10000; i++) {
- ida_simple_remove(&ida, i);
- }
+ for (i = 0; i < 10000; i++)
+ ida_free(&ida, i);
assert(ida_is_empty(&ida));
ida_destroy(&ida);
@@ -524,7 +522,7 @@ void user_ida_checks(void)
ida_check_nomem();
ida_check_conv_user();
ida_check_random();
- ida_simple_get_remove_test();
+ ida_alloc_free_test();
radix_tree_cpu_dead(1);
}
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index a2d8e1093b00..36d03860d9d8 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -36,6 +36,7 @@ TARGETS += filesystems/fat
TARGETS += filesystems/overlayfs
TARGETS += filesystems/statmount
TARGETS += filesystems/mount-notify
+TARGETS += filesystems/fuse
TARGETS += firmware
TARGETS += fpu
TARGETS += ftrace
diff --git a/tools/testing/selftests/filesystems/fuse/.gitignore b/tools/testing/selftests/filesystems/fuse/.gitignore
new file mode 100644
index 000000000000..3e72e742d08e
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fuse/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+fuse_mnt
+fusectl_test
diff --git a/tools/testing/selftests/filesystems/fuse/Makefile b/tools/testing/selftests/filesystems/fuse/Makefile
new file mode 100644
index 000000000000..612aad69a93a
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fuse/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES)
+
+TEST_GEN_PROGS := fusectl_test
+TEST_GEN_FILES := fuse_mnt
+
+include ../../lib.mk
+
+VAR_CFLAGS := $(shell pkg-config fuse --cflags 2>/dev/null)
+ifeq ($(VAR_CFLAGS),)
+VAR_CFLAGS := -D_FILE_OFFSET_BITS=64 -I/usr/include/fuse
+endif
+
+VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null)
+ifeq ($(VAR_LDLIBS),)
+VAR_LDLIBS := -lfuse -pthread
+endif
+
+$(OUTPUT)/fuse_mnt: CFLAGS += $(VAR_CFLAGS)
+$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
diff --git a/tools/testing/selftests/filesystems/fuse/fuse_mnt.c b/tools/testing/selftests/filesystems/fuse/fuse_mnt.c
new file mode 100644
index 000000000000..d12b17f30fad
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fuse/fuse_mnt.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fusectl test file-system
+ * Creates a simple FUSE filesystem with a single read-write file (/test)
+ */
+
+#define FUSE_USE_VERSION 26
+
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+static char *content;
+static size_t content_size;
+static const char test_path[] = "/test";
+
+static int test_getattr(const char *path, struct stat *st)
+{
+ memset(st, 0, sizeof(*st));
+
+ if (!strcmp(path, "/")) {
+ st->st_mode = S_IFDIR | 0755;
+ st->st_nlink = 2;
+ return 0;
+ }
+
+ if (!strcmp(path, test_path)) {
+ st->st_mode = S_IFREG | 0664;
+ st->st_nlink = 1;
+ st->st_size = content_size;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int test_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+ off_t offset, struct fuse_file_info *fi)
+{
+ if (strcmp(path, "/"))
+ return -ENOENT;
+
+ filler(buf, ".", NULL, 0);
+ filler(buf, "..", NULL, 0);
+ filler(buf, test_path + 1, NULL, 0);
+
+ return 0;
+}
+
+static int test_open(const char *path, struct fuse_file_info *fi)
+{
+ if (strcmp(path, test_path))
+ return -ENOENT;
+
+ return 0;
+}
+
+static int test_read(const char *path, char *buf, size_t size, off_t offset,
+ struct fuse_file_info *fi)
+{
+ if (strcmp(path, test_path) != 0)
+ return -ENOENT;
+
+ if (!content || content_size == 0)
+ return 0;
+
+ if (offset >= content_size)
+ return 0;
+
+ if (offset + size > content_size)
+ size = content_size - offset;
+
+ memcpy(buf, content + offset, size);
+
+ return size;
+}
+
+static int test_write(const char *path, const char *buf, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ size_t new_size;
+
+ if (strcmp(path, test_path) != 0)
+ return -ENOENT;
+
+ if (offset > content_size)
+ return -EINVAL;
+
+ new_size = MAX(offset + size, content_size);
+
+ if (new_size > content_size) {
+ /* Grow via a temporary so the old buffer survives realloc failure. */
+ char *grown = realloc(content, new_size);
+
+ if (!grown)
+ return -ENOMEM;
+
+ content = grown;
+ }
+
+ content_size = new_size;
+
+ memcpy(content + offset, buf, size);
+
+ return size;
+}
+
+static int test_truncate(const char *path, off_t size)
+{
+ if (strcmp(path, test_path) != 0)
+ return -ENOENT;
+
+ if (size == 0) {
+ free(content);
+ content = NULL;
+ content_size = 0;
+ return 0;
+ }
+
+ /* Use a temporary so content survives a failed realloc. */
+ char *resized = realloc(content, size);
+
+ if (!resized)
+ return -ENOMEM;
+
+ content = resized;
+
+ if (size > content_size)
+ memset(content + content_size, 0, size - content_size);
+
+ content_size = size;
+ return 0;
+}
+
+static struct fuse_operations memfd_ops = {
+ .getattr = test_getattr,
+ .readdir = test_readdir,
+ .open = test_open,
+ .read = test_read,
+ .write = test_write,
+ .truncate = test_truncate,
+};
+
+int main(int argc, char *argv[])
+{
+ return fuse_main(argc, argv, &memfd_ops, NULL);
+}
diff --git a/tools/testing/selftests/filesystems/fuse/fusectl_test.c b/tools/testing/selftests/filesystems/fuse/fusectl_test.c
new file mode 100644
index 000000000000..8d124d1cacb2
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fuse/fusectl_test.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2025 Chen Linxuan <chenlinxuan@uniontech.com>
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sched.h>
+#include <linux/limits.h>
+
+#include "../../kselftest_harness.h"
+
+#define FUSECTL_MOUNTPOINT "/sys/fs/fuse/connections"
+#define FUSE_MOUNTPOINT "/tmp/fuse_mnt_XXXXXX"
+#define FUSE_DEVICE "/dev/fuse"
+#define FUSECTL_TEST_VALUE "1"
+
+static void write_file(struct __test_metadata *const _metadata,
+ const char *path, const char *val)
+{
+ int fd = open(path, O_WRONLY);
+ size_t len = strlen(val);
+
+ ASSERT_GE(fd, 0);
+ ASSERT_EQ(write(fd, val, len), len);
+ ASSERT_EQ(close(fd), 0);
+}
+
+FIXTURE(fusectl) {
+ char fuse_mountpoint[sizeof(FUSE_MOUNTPOINT)];
+ int connection;
+};
+
+FIXTURE_SETUP(fusectl)
+{
+ const char *fuse_mnt_prog = "./fuse_mnt";
+ int status, pid;
+ struct stat statbuf;
+ uid_t uid = getuid();
+ gid_t gid = getgid();
+ char buf[32];
+
+ /* Setup userns */
+ ASSERT_EQ(unshare(CLONE_NEWNS|CLONE_NEWUSER), 0);
+ sprintf(buf, "0 %d 1", uid);
+ write_file(_metadata, "/proc/self/uid_map", buf);
+ write_file(_metadata, "/proc/self/setgroups", "deny");
+ sprintf(buf, "0 %d 1", gid);
+ write_file(_metadata, "/proc/self/gid_map", buf);
+ ASSERT_EQ(mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL), 0);
+
+ strcpy(self->fuse_mountpoint, FUSE_MOUNTPOINT);
+
+ if (!mkdtemp(self->fuse_mountpoint))
+ SKIP(return,
+ "Failed to create FUSE mountpoint %s",
+ strerror(errno));
+
+ if (access(FUSECTL_MOUNTPOINT, F_OK))
+ SKIP(return,
+ "FUSE control filesystem not mounted");
+
+ pid = fork();
+ if (pid < 0)
+ SKIP(return,
+ "Failed to fork FUSE daemon process: %s",
+ strerror(errno));
+
+ if (pid == 0) {
+ execlp(fuse_mnt_prog, fuse_mnt_prog, self->fuse_mountpoint, NULL);
+ exit(errno);
+ }
+
+ waitpid(pid, &status, 0);
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+ SKIP(return,
+ "Failed to start FUSE daemon %s",
+ strerror(WEXITSTATUS(status)));
+ }
+
+ if (stat(self->fuse_mountpoint, &statbuf))
+ SKIP(return,
+ "Failed to stat FUSE mountpoint %s",
+ strerror(errno));
+
+ self->connection = statbuf.st_dev;
+}
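
Note: the bare waitpid() above is enough to detect a failed mount because fuse_mnt is built against libfuse 2 and runs fuse_main() with default options: the exec'ed process daemonizes only after the FUSE mount is established, exiting 0 on success, so any non-zero exit status can be treated as a mount failure and turned into a SKIP.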
+
+FIXTURE_TEARDOWN(fusectl)
+{
+ umount2(self->fuse_mountpoint, MNT_DETACH);
+ rmdir(self->fuse_mountpoint);
+}
+
+TEST_F(fusectl, abort)
+{
+ char path_buf[PATH_MAX];
+ int abort_fd, test_fd, ret;
+
+ sprintf(path_buf, "/sys/fs/fuse/connections/%d/abort", self->connection);
+
+ ASSERT_EQ(0, access(path_buf, F_OK));
+
+ abort_fd = open(path_buf, O_WRONLY);
+ ASSERT_GE(abort_fd, 0);
+
+ sprintf(path_buf, "%s/test", self->fuse_mountpoint);
+
+ test_fd = open(path_buf, O_RDWR);
+ ASSERT_GE(test_fd, 0);
+
+ ret = read(test_fd, path_buf, sizeof(path_buf));
+ ASSERT_EQ(ret, 0);
+
+ ret = write(test_fd, "test", sizeof("test"));
+ ASSERT_EQ(ret, sizeof("test"));
+
+ ret = lseek(test_fd, 0, SEEK_SET);
+ ASSERT_GE(ret, 0);
+
+ ret = write(abort_fd, FUSECTL_TEST_VALUE, sizeof(FUSECTL_TEST_VALUE));
+ ASSERT_GT(ret, 0);
+
+ close(abort_fd);
+
+ ret = read(test_fd, path_buf, sizeof(path_buf));
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, ENOTCONN);
+}
+
+TEST_HARNESS_MAIN
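
Beyond abort, each fusectl connection directory also exposes waiting, max_background, and congestion_threshold. A standalone sketch (paths assumed to match the layout used in the test above) that reports how many requests are awaiting a reply:

/*
 * Sketch: report the number of requests waiting on a FUSE connection.
 * Pass the connection id (st_dev of the mount, as in the fixture above)
 * as argv[1]. The "waiting" file sits next to "abort" in fusectl.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[256], buf[32];
	ssize_t n;
	int fd;

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path),
		 "/sys/fs/fuse/connections/%s/waiting", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return 1;
	buf[n] = '\0';
	printf("requests waiting for a reply: %s", buf);
	return 0;
}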
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 6b78a8382d40..9c9735570abf 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -7,6 +7,7 @@
/proc-loadavg-001
/proc-maps-race
/proc-multiple-procfs
+/proc-net-dev-lseek
/proc-empty-vm
/proc-pid-vm
/proc-self-map-files-001
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index be3013515aae..a7de2bb6d8be 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -10,6 +10,7 @@ TEST_GEN_PROGS += fd-003-kthread
TEST_GEN_PROGS += proc-2-is-kthread
TEST_GEN_PROGS += proc-loadavg-001
TEST_GEN_PROGS += proc-maps-race
+TEST_GEN_PROGS += proc-net-dev-lseek
TEST_GEN_PROGS += proc-empty-vm
TEST_GEN_PROGS += proc-pid-vm
TEST_GEN_PROGS += proc-self-map-files-001
diff --git a/tools/testing/selftests/proc/proc-net-dev-lseek.c b/tools/testing/selftests/proc/proc-net-dev-lseek.c
new file mode 100644
index 000000000000..742a3e804451
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-net-dev-lseek.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2025 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <sched.h>
+/*
+ * Test that lseek("/proc/net/dev/", 0, SEEK_SET)
+ * a) works,
+ * b) does what you think it does.
+ */
+int main(void)
+{
+ /* /proc/net/dev output is deterministic in fresh netns only. */
+ if (unshare(CLONE_NEWNET) == -1) {
+ if (errno == ENOSYS || errno == EPERM) {
+ return 4;
+ }
+ return 1;
+ }
+
+ const int fd = open("/proc/net/dev", O_RDONLY);
+ assert(fd >= 0);
+
+ char buf1[4096];
+ const ssize_t rv1 = read(fd, buf1, sizeof(buf1));
+ /*
+ * Not "<=", this file can't be empty:
+ * there is header, "lo" interface with some zeroes.
+ */
+ assert(0 < rv1);
+ assert(rv1 <= sizeof(buf1));
+
+ /* Believe it or not, this line broke one day. */
+ assert(lseek(fd, 0, SEEK_SET) == 0);
+
+ char buf2[4096];
+ const ssize_t rv2 = read(fd, buf2, sizeof(buf2));
+ /* Not "<=", see above. */
+ assert(0 < rv2);
+ assert(rv2 <= sizeof(buf2));
+
+ /* Test that lseek rewinds to the beginning of the file. */
+ assert(rv1 == rv2);
+ assert(memcmp(buf1, buf2, rv1) == 0);
+
+ /* Contents of the file are not validated: this test is about lseek(). */
+
+ return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c
index d04685771952..978cbcb3eb11 100644
--- a/tools/testing/selftests/proc/proc-pid-vm.c
+++ b/tools/testing/selftests/proc/proc-pid-vm.c
@@ -47,6 +47,10 @@
#include <sys/resource.h>
#include <linux/fs.h>
+#ifndef __maybe_unused
+#define __maybe_unused __attribute__((__unused__))
+#endif
+
#include "../kselftest.h"
static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
@@ -218,12 +222,12 @@ static int make_exe(const uint8_t *payload, size_t len)
* 2: vsyscall VMA is r-xp vsyscall=emulate
*/
static volatile int g_vsyscall;
-static const char *str_vsyscall;
+static const char *str_vsyscall __maybe_unused;
-static const char str_vsyscall_0[] = "";
-static const char str_vsyscall_1[] =
+static const char str_vsyscall_0[] __maybe_unused = "";
+static const char str_vsyscall_1[] __maybe_unused =
"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n";
-static const char str_vsyscall_2[] =
+static const char str_vsyscall_2[] __maybe_unused =
"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n";
#ifdef __x86_64__