diff options
213 files changed, 2875 insertions, 1595 deletions
diff --git a/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.yaml b/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.yaml index f89ebde76dab..de7c5e59bae1 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.yaml @@ -30,6 +30,7 @@ properties: - socionext,uniphier-ld11-aidet - socionext,uniphier-ld20-aidet - socionext,uniphier-pxs3-aidet + - socionext,uniphier-nx1-aidet reg: maxItems: 1 diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 871d2da7a0a9..87814696a65b 100644 --- a/Documentation/filesystems/ext4/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -13,8 +13,8 @@ disappeared as of Linux 3.0. There are two places where extended attributes can be found. The first place is between the end of each inode entry and the beginning of the -next inode entry. For example, if inode.i\_extra\_isize = 28 and -sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes +next inode entry. For example, if inode.i_extra_isize = 28 and +sb.inode_size = 256, then there are 256 - (128 + 28) = 100 bytes available for in-inode extended attribute storage. The second place where extended attributes can be found is in the block pointed to by ``inode.i_file_acl``. As of Linux 3.11, it is not possible for this @@ -38,8 +38,8 @@ Extended attributes, when stored after the inode, have a header - Name - Description * - 0x0 - - \_\_le32 - - h\_magic + - __le32 + - h_magic - Magic number for identification, 0xEA020000. This value is set by the Linux driver, though e2fsprogs doesn't seem to check it(?) @@ -55,28 +55,28 @@ The beginning of an extended attribute block is in - Name - Description * - 0x0 - - \_\_le32 - - h\_magic + - __le32 + - h_magic - Magic number for identification, 0xEA020000. * - 0x4 - - \_\_le32 - - h\_refcount + - __le32 + - h_refcount - Reference count. * - 0x8 - - \_\_le32 - - h\_blocks + - __le32 + - h_blocks - Number of disk blocks used. * - 0xC - - \_\_le32 - - h\_hash + - __le32 + - h_hash - Hash value of all attributes. * - 0x10 - - \_\_le32 - - h\_checksum + - __le32 + - h_checksum - Checksum of the extended attribute block. * - 0x14 - - \_\_u32 - - h\_reserved[3] + - __u32 + - h_reserved[3] - Zero. The checksum is calculated against the FS UUID, the 64-bit block number @@ -100,46 +100,46 @@ Attributes stored inside an inode do not need be stored in sorted order. - Name - Description * - 0x0 - - \_\_u8 - - e\_name\_len + - __u8 + - e_name_len - Length of name. * - 0x1 - - \_\_u8 - - e\_name\_index + - __u8 + - e_name_index - Attribute name index. There is a discussion of this below. * - 0x2 - - \_\_le16 - - e\_value\_offs + - __le16 + - e_value_offs - Location of this attribute's value on the disk block where it is stored. Multiple attributes can share the same value. For an inode attribute this value is relative to the start of the first entry; for a block this value is relative to the start of the block (i.e. the header). * - 0x4 - - \_\_le32 - - e\_value\_inum + - __le32 + - e_value_inum - The inode where the value is stored. Zero indicates the value is in the same block as this entry. This field is only used if the - INCOMPAT\_EA\_INODE feature is enabled. + INCOMPAT_EA_INODE feature is enabled. * - 0x8 - - \_\_le32 - - e\_value\_size + - __le32 + - e_value_size - Length of attribute value. * - 0xC - - \_\_le32 - - e\_hash + - __le32 + - e_hash - Hash value of attribute name and attribute value. The kernel doesn't update the hash for in-inode attributes, so for that case this value must be zero, because e2fsck validates any non-zero hash regardless of where the xattr lives. * - 0x10 - char - - e\_name[e\_name\_len] + - e_name[e_name_len] - Attribute name. Does not include trailing NULL. Attribute values can follow the end of the entry table. There appears to be a requirement that they be aligned to 4-byte boundaries. The values are stored starting at the end of the block and grow towards the -xattr\_header/xattr\_entry table. When the two collide, the overflow is +xattr_header/xattr_entry table. When the two collide, the overflow is put into a separate disk block. If the disk block fills up, the filesystem returns -ENOSPC. @@ -167,15 +167,15 @@ the key name. Here is a map of name index values to key prefixes: * - 1 - “user.” * - 2 - - “system.posix\_acl\_access” + - “system.posix_acl_access” * - 3 - - “system.posix\_acl\_default” + - “system.posix_acl_default” * - 4 - “trusted.” * - 6 - “security.” * - 7 - - “system.” (inline\_data only?) + - “system.” (inline_data only?) * - 8 - “system.richacl” (SuSE kernels only?) diff --git a/Documentation/filesystems/ext4/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst index 72075aa608e4..976a180b209c 100644 --- a/Documentation/filesystems/ext4/bigalloc.rst +++ b/Documentation/filesystems/ext4/bigalloc.rst @@ -23,7 +23,7 @@ means that a block group addresses 32 gigabytes instead of 128 megabytes, also shrinking the amount of file system overhead for metadata. The administrator can set a block cluster size at mkfs time (which is -stored in the s\_log\_cluster\_size field in the superblock); from then +stored in the s_log_cluster_size field in the superblock); from then on, the block bitmaps track clusters, not individual blocks. This means that block groups can be several gigabytes in size (instead of just 128MiB); however, the minimum allocation unit becomes a cluster, not a diff --git a/Documentation/filesystems/ext4/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst index c7546dbc197a..91c45d86e9bb 100644 --- a/Documentation/filesystems/ext4/bitmaps.rst +++ b/Documentation/filesystems/ext4/bitmaps.rst @@ -9,15 +9,15 @@ group. The inode bitmap records which entries in the inode table are in use. As with most bitmaps, one bit represents the usage status of one data -block or inode table entry. This implies a block group size of 8 \* -number\_of\_bytes\_in\_a\_logical\_block. +block or inode table entry. This implies a block group size of 8 * +number_of_bytes_in_a_logical_block. NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts of the kernel and e2fsprogs code pretends that the block bitmap contains zeros (i.e. all blocks in the group are free). However, it is not necessarily the case that no blocks are in use -- if ``meta_bg`` is set, the bitmaps and group descriptor live inside the group. Unfortunately, -ext2fs\_test\_block\_bitmap2() will return '0' for those locations, +ext2fs_test_block_bitmap2() will return '0' for those locations, which produces confusing debugfs output. Inode Table diff --git a/Documentation/filesystems/ext4/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index d5d652addce5..46d78f860623 100644 --- a/Documentation/filesystems/ext4/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst @@ -56,39 +56,39 @@ established that the super block and the group descriptor table, if present, will be at the beginning of the block group. The bitmaps and the inode table can be anywhere, and it is quite possible for the bitmaps to come after the inode table, or for both to be in different -groups (flex\_bg). Leftover space is used for file data blocks, indirect +groups (flex_bg). Leftover space is used for file data blocks, indirect block maps, extent tree blocks, and extended attributes. Flexible Block Groups --------------------- Starting in ext4, there is a new feature called flexible block groups -(flex\_bg). In a flex\_bg, several block groups are tied together as one +(flex_bg). In a flex_bg, several block groups are tied together as one logical block group; the bitmap spaces and the inode table space in the -first block group of the flex\_bg are expanded to include the bitmaps -and inode tables of all other block groups in the flex\_bg. For example, -if the flex\_bg size is 4, then group 0 will contain (in order) the +first block group of the flex_bg are expanded to include the bitmaps +and inode tables of all other block groups in the flex_bg. For example, +if the flex_bg size is 4, then group 0 will contain (in order) the superblock, group descriptors, data block bitmaps for groups 0-3, inode bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining space in group 0 is for file data. The effect of this is to group the block group metadata close together for faster loading, and to enable large files to be continuous on disk. Backup copies of the superblock and group descriptors are always at the beginning of block groups, even -if flex\_bg is enabled. The number of block groups that make up a -flex\_bg is given by 2 ^ ``sb.s_log_groups_per_flex``. +if flex_bg is enabled. The number of block groups that make up a +flex_bg is given by 2 ^ ``sb.s_log_groups_per_flex``. Meta Block Groups ----------------- -Without the option META\_BG, for safety concerns, all block group +Without the option META_BG, for safety concerns, all block group descriptors copies are kept in the first block group. Given the default 128MiB(2^27 bytes) block group size and 64-byte group descriptors, ext4 can have at most 2^27/64 = 2^21 block groups. This limits the entire filesystem size to 2^21 * 2^27 = 2^48bytes or 256TiB. The solution to this problem is to use the metablock group feature -(META\_BG), which is already in ext3 for all 2.6 releases. With the -META\_BG feature, ext4 filesystems are partitioned into many metablock +(META_BG), which is already in ext3 for all 2.6 releases. With the +META_BG feature, ext4 filesystems are partitioned into many metablock groups. Each metablock group is a cluster of block groups whose group descriptor structures can be stored in a single disk block. For ext4 filesystems with 4 KB block size, a single metablock group partition @@ -110,7 +110,7 @@ bytes, a meta-block group contains 32 block groups for filesystems with a 1KB block size, and 128 block groups for filesystems with a 4KB blocksize. Filesystems can either be created using this new block group descriptor layout, or existing filesystems can be resized on-line, and -the field s\_first\_meta\_bg in the superblock will indicate the first +the field s_first_meta_bg in the superblock will indicate the first block group using this new layout. Please see an important note about ``BLOCK_UNINIT`` in the section about @@ -121,15 +121,15 @@ Lazy Block Group Initialization A new feature for ext4 are three block group descriptor flags that enable mkfs to skip initializing other parts of the block group -metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean +metadata. Specifically, the INODE_UNINIT and BLOCK_UNINIT flags mean that the inode and block bitmaps for that group can be calculated and therefore the on-disk bitmap blocks are not initialized. This is generally the case for an empty block group or a block group containing -only fixed-location block group metadata. The INODE\_ZEROED flag means +only fixed-location block group metadata. The INODE_ZEROED flag means that the inode table has been initialized; mkfs will unset this flag and rely on the kernel to initialize the inode tables in the background. By not writing zeroes to the bitmaps and inode table, mkfs time is -reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, -but the dumpe2fs output prints this as “uninit\_bg”. They are the same +reduced considerably. Note the feature flag is RO_COMPAT_GDT_CSUM, +but the dumpe2fs output prints this as “uninit_bg”. They are the same thing. diff --git a/Documentation/filesystems/ext4/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst index 30e25750d88a..2bd990402a5c 100644 --- a/Documentation/filesystems/ext4/blockmap.rst +++ b/Documentation/filesystems/ext4/blockmap.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| i.i\_block Offset | Where It Points | +| i.i_block Offset | Where It Points | +=====================+==============================================================================================================================================================================================================================+ | 0 to 11 | Direct map to file blocks 0 to 11. | +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/Documentation/filesystems/ext4/checksums.rst b/Documentation/filesystems/ext4/checksums.rst index 5519e253810d..e232749daf5f 100644 --- a/Documentation/filesystems/ext4/checksums.rst +++ b/Documentation/filesystems/ext4/checksums.rst @@ -4,7 +4,7 @@ Checksums --------- Starting in early 2012, metadata checksums were added to all major ext4 -and jbd2 data structures. The associated feature flag is metadata\_csum. +and jbd2 data structures. The associated feature flag is metadata_csum. The desired checksum algorithm is indicated in the superblock, though as of October 2012 the only supported algorithm is crc32c. Some data structures did not have space to fit a full 32-bit checksum, so only the @@ -20,7 +20,7 @@ encounters directory blocks that lack sufficient empty space to add a checksum, it will request that you run ``e2fsck -D`` to have the directories rebuilt with checksums. This has the added benefit of removing slack space from the directory files and rebalancing the htree -indexes. If you \_ignore\_ this step, your directories will not be +indexes. If you _ignore_ this step, your directories will not be protected by a checksum! The following table describes the data elements that go into each type @@ -35,39 +35,39 @@ of checksum. The checksum function is whatever the superblock describes - Length - Ingredients * - Superblock - - \_\_le32 + - __le32 - The entire superblock up to the checksum field. The UUID lives inside the superblock. * - MMP - - \_\_le32 + - __le32 - UUID + the entire MMP block up to the checksum field. * - Extended Attributes - - \_\_le32 + - __le32 - UUID + the entire extended attribute block. The checksum field is set to zero. * - Directory Entries - - \_\_le32 + - __le32 - UUID + inode number + inode generation + the directory block up to the fake entry enclosing the checksum field. * - HTREE Nodes - - \_\_le32 + - __le32 - UUID + inode number + inode generation + all valid extents + HTREE tail. The checksum field is set to zero. * - Extents - - \_\_le32 + - __le32 - UUID + inode number + inode generation + the entire extent block up to the checksum field. * - Bitmaps - - \_\_le32 or \_\_le16 + - __le32 or __le16 - UUID + the entire bitmap. Checksums are stored in the group descriptor, and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) * - Inodes - - \_\_le32 + - __le32 - UUID + inode number + inode generation + the entire inode. The checksum field is set to zero. Each inode has its own checksum. * - Group Descriptors - - \_\_le16 - - If metadata\_csum, then UUID + group number + the entire descriptor; - else if gdt\_csum, then crc16(UUID + group number + the entire + - __le16 + - If metadata_csum, then UUID + group number + the entire descriptor; + else if gdt_csum, then crc16(UUID + group number + the entire descriptor). In all cases, only the lower 16 bits are stored. diff --git a/Documentation/filesystems/ext4/directory.rst b/Documentation/filesystems/ext4/directory.rst index 55f618b37144..6eece8e31df8 100644 --- a/Documentation/filesystems/ext4/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst @@ -42,24 +42,24 @@ is at most 263 bytes long, though on disk you'll need to reference - Name - Description * - 0x0 - - \_\_le32 + - __le32 - inode - Number of the inode that this directory entry points to. * - 0x4 - - \_\_le16 - - rec\_len + - __le16 + - rec_len - Length of this directory entry. Must be a multiple of 4. * - 0x6 - - \_\_le16 - - name\_len + - __le16 + - name_len - Length of the file name. * - 0x8 - char - - name[EXT4\_NAME\_LEN] + - name[EXT4_NAME_LEN] - File name. Since file names cannot be longer than 255 bytes, the new directory -entry format shortens the name\_len field and uses the space for a file +entry format shortens the name_len field and uses the space for a file type flag, probably to avoid having to load every inode during directory tree traversal. This format is ``ext4_dir_entry_2``, which is at most 263 bytes long, though on disk you'll need to reference @@ -74,24 +74,24 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most - Name - Description * - 0x0 - - \_\_le32 + - __le32 - inode - Number of the inode that this directory entry points to. * - 0x4 - - \_\_le16 - - rec\_len + - __le16 + - rec_len - Length of this directory entry. * - 0x6 - - \_\_u8 - - name\_len + - __u8 + - name_len - Length of the file name. * - 0x7 - - \_\_u8 - - file\_type + - __u8 + - file_type - File type code, see ftype_ table below. * - 0x8 - char - - name[EXT4\_NAME\_LEN] + - name[EXT4_NAME_LEN] - File name. .. _ftype: @@ -137,19 +137,19 @@ entry uses this extension, it may be up to 271 bytes. - Name - Description * - 0x0 - - \_\_le32 + - __le32 - hash - The hash of the directory name * - 0x4 - - \_\_le32 - - minor\_hash + - __le32 + - minor_hash - The minor hash of the directory name In order to add checksums to these classic directory blocks, a phony ``struct ext4_dir_entry`` is placed at the end of each leaf block to hold the checksum. The directory entry is 12 bytes long. The inode -number and name\_len fields are set to zero to fool old software into +number and name_len fields are set to zero to fool old software into ignoring an apparently empty directory entry, and the checksum is stored in the place where the name normally goes. The structure is ``struct ext4_dir_entry_tail``: @@ -163,24 +163,24 @@ in the place where the name normally goes. The structure is - Name - Description * - 0x0 - - \_\_le32 - - det\_reserved\_zero1 + - __le32 + - det_reserved_zero1 - Inode number, which must be zero. * - 0x4 - - \_\_le16 - - det\_rec\_len + - __le16 + - det_rec_len - Length of this directory entry, which must be 12. * - 0x6 - - \_\_u8 - - det\_reserved\_zero2 + - __u8 + - det_reserved_zero2 - Length of the file name, which must be zero. * - 0x7 - - \_\_u8 - - det\_reserved\_ft + - __u8 + - det_reserved_ft - File type, which must be 0xDE. * - 0x8 - - \_\_le32 - - det\_checksum + - __le32 + - det_checksum - Directory leaf block checksum. The leaf directory block checksum is calculated against the FS UUID, the @@ -194,7 +194,7 @@ Hash Tree Directories A linear array of directory entries isn't great for performance, so a new feature was added to ext3 to provide a faster (but peculiar) balanced tree keyed off a hash of the directory entry name. If the -EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a +EXT4_INDEX_FL (0x1000) flag is set in the inode, this directory uses a hashed btree (htree) to organize and find directory entries. For backwards read-only compatibility with ext2, this tree is actually hidden inside the directory file, masquerading as “empty” directory data @@ -206,14 +206,14 @@ rest of the directory block is empty so that it moves on. The root of the tree always lives in the first data block of the directory. By ext2 custom, the '.' and '..' entries must appear at the beginning of this first block, so they are put here as two -``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of +``struct ext4_dir_entry_2`` s and not stored in the tree. The rest of the root node contains metadata about the tree and finally a hash->block map to find nodes that are lower in the htree. If ``dx_root.info.indirect_levels`` is non-zero then the htree has two levels; the data block pointed to by the root node's map is an interior node, which is indexed by a minor hash. Interior nodes in this tree contains a zeroed out ``struct ext4_dir_entry_2`` followed by a -minor\_hash->block map to find leafe nodes. Leaf nodes contain a linear +minor_hash->block map to find leafe nodes. Leaf nodes contain a linear array of all ``struct ext4_dir_entry_2``; all of these entries (presumably) hash to the same value. If there is an overflow, the entries simply overflow into the next leaf node, and the @@ -245,83 +245,83 @@ of a data block: - Name - Description * - 0x0 - - \_\_le32 + - __le32 - dot.inode - inode number of this directory. * - 0x4 - - \_\_le16 - - dot.rec\_len + - __le16 + - dot.rec_len - Length of this record, 12. * - 0x6 - u8 - - dot.name\_len + - dot.name_len - Length of the name, 1. * - 0x7 - u8 - - dot.file\_type + - dot.file_type - File type of this entry, 0x2 (directory) (if the feature flag is set). * - 0x8 - char - dot.name[4] - - “.\\0\\0\\0” + - “.\0\0\0” * - 0xC - - \_\_le32 + - __le32 - dotdot.inode - inode number of parent directory. * - 0x10 - - \_\_le16 - - dotdot.rec\_len - - block\_size - 12. The record length is long enough to cover all htree + - __le16 + - dotdot.rec_len + - block_size - 12. The record length is long enough to cover all htree data. * - 0x12 - u8 - - dotdot.name\_len + - dotdot.name_len - Length of the name, 2. * - 0x13 - u8 - - dotdot.file\_type + - dotdot.file_type - File type of this entry, 0x2 (directory) (if the feature flag is set). * - 0x14 - char - - dotdot\_name[4] - - “..\\0\\0” + - dotdot_name[4] + - “..\0\0” * - 0x18 - - \_\_le32 - - struct dx\_root\_info.reserved\_zero + - __le32 + - struct dx_root_info.reserved_zero - Zero. * - 0x1C - u8 - - struct dx\_root\_info.hash\_version + - struct dx_root_info.hash_version - Hash type, see dirhash_ table below. * - 0x1D - u8 - - struct dx\_root\_info.info\_length + - struct dx_root_info.info_length - Length of the tree information, 0x8. * - 0x1E - u8 - - struct dx\_root\_info.indirect\_levels - - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR + - struct dx_root_info.indirect_levels + - Depth of the htree. Cannot be larger than 3 if the INCOMPAT_LARGEDIR feature is set; cannot be larger than 2 otherwise. * - 0x1F - u8 - - struct dx\_root\_info.unused\_flags + - struct dx_root_info.unused_flags - * - 0x20 - - \_\_le16 + - __le16 - limit - - Maximum number of dx\_entries that can follow this header, plus 1 for + - Maximum number of dx_entries that can follow this header, plus 1 for the header itself. * - 0x22 - - \_\_le16 + - __le16 - count - - Actual number of dx\_entries that follow this header, plus 1 for the + - Actual number of dx_entries that follow this header, plus 1 for the header itself. * - 0x24 - - \_\_le32 + - __le32 - block - The block number (within the directory file) that goes with hash=0. * - 0x28 - - struct dx\_entry + - struct dx_entry - entries[0] - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. @@ -362,38 +362,38 @@ also the full length of a data block: - Name - Description * - 0x0 - - \_\_le32 + - __le32 - fake.inode - Zero, to make it look like this entry is not in use. * - 0x4 - - \_\_le16 - - fake.rec\_len - - The size of the block, in order to hide all of the dx\_node data. + - __le16 + - fake.rec_len + - The size of the block, in order to hide all of the dx_node data. * - 0x6 - u8 - - name\_len + - name_len - Zero. There is no name for this “unused” directory entry. * - 0x7 - u8 - - file\_type + - file_type - Zero. There is no file type for this “unused” directory entry. * - 0x8 - - \_\_le16 + - __le16 - limit - - Maximum number of dx\_entries that can follow this header, plus 1 for + - Maximum number of dx_entries that can follow this header, plus 1 for the header itself. * - 0xA - - \_\_le16 + - __le16 - count - - Actual number of dx\_entries that follow this header, plus 1 for the + - Actual number of dx_entries that follow this header, plus 1 for the header itself. * - 0xE - - \_\_le32 + - __le32 - block - The block number (within the directory file) that goes with the lowest hash value of this block. This value is stored in the parent block. * - 0x12 - - struct dx\_entry + - struct dx_entry - entries[0] - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. @@ -410,11 +410,11 @@ long: - Name - Description * - 0x0 - - \_\_le32 + - __le32 - hash - Hash code. * - 0x4 - - \_\_le32 + - __le32 - block - Block number (within the directory file, not filesystem blocks) of the next node in the htree. @@ -423,13 +423,13 @@ long: author.) If metadata checksums are enabled, the last 8 bytes of the directory -block (precisely the length of one dx\_entry) are used to store a +block (precisely the length of one dx_entry) are used to store a ``struct dx_tail``, which contains the checksum. The ``limit`` and -``count`` entries in the dx\_root/dx\_node structures are adjusted as -necessary to fit the dx\_tail into the block. If there is no space for -the dx\_tail, the user is notified to run e2fsck -D to rebuild the +``count`` entries in the dx_root/dx_node structures are adjusted as +necessary to fit the dx_tail into the block. If there is no space for +the dx_tail, the user is notified to run e2fsck -D to rebuild the directory index (which will ensure that there's space for the checksum. -The dx\_tail structure is 8 bytes long and looks like this: +The dx_tail structure is 8 bytes long and looks like this: .. list-table:: :widths: 8 8 24 40 @@ -441,13 +441,13 @@ The dx\_tail structure is 8 bytes long and looks like this: - Description * - 0x0 - u32 - - dt\_reserved + - dt_reserved - Zero. * - 0x4 - - \_\_le32 - - dt\_checksum + - __le32 + - dt_checksum - Checksum of the htree directory block. The checksum is calculated against the FS UUID, the htree index header -(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in -use, and the tail block (dx\_tail). +(dx_root or dx_node), all of the htree indices (dx_entry) that are in +use, and the tail block (dx_tail). diff --git a/Documentation/filesystems/ext4/eainode.rst b/Documentation/filesystems/ext4/eainode.rst index ecc0d01a0a72..7a2ef26b064a 100644 --- a/Documentation/filesystems/ext4/eainode.rst +++ b/Documentation/filesystems/ext4/eainode.rst @@ -5,14 +5,14 @@ Large Extended Attribute Values To enable ext4 to store extended attribute values that do not fit in the inode or in the single extended attribute block attached to an inode, -the EA\_INODE feature allows us to store the value in the data blocks of +the EA_INODE feature allows us to store the value in the data blocks of a regular file inode. This “EA inode” is linked only from the extended attribute name index and must not appear in a directory entry. The -inode's i\_atime field is used to store a checksum of the xattr value; -and i\_ctime/i\_version store a 64-bit reference count, which enables +inode's i_atime field is used to store a checksum of the xattr value; +and i_ctime/i_version store a 64-bit reference count, which enables sharing of large xattr values between multiple owning inodes. For backward compatibility with older versions of this feature, the -i\_mtime/i\_generation *may* store a back-reference to the inode number -and i\_generation of the **one** owning inode (in cases where the EA +i_mtime/i_generation *may* store a back-reference to the inode number +and i_generation of the **one** owning inode (in cases where the EA inode is not referenced by multiple inodes) to verify that the EA inode is the correct one being accessed. diff --git a/Documentation/filesystems/ext4/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst index 7ba6114e7f5c..392ec44f8fb0 100644 --- a/Documentation/filesystems/ext4/group_descr.rst +++ b/Documentation/filesystems/ext4/group_descr.rst @@ -7,34 +7,34 @@ Each block group on the filesystem has one of these descriptors associated with it. As noted in the Layout section above, the group descriptors (if present) are the second item in the block group. The standard configuration is for each block group to contain a full copy of -the block group descriptor table unless the sparse\_super feature flag +the block group descriptor table unless the sparse_super feature flag is set. Notice how the group descriptor records the location of both bitmaps and the inode table (i.e. they can float). This means that within a block group, the only data structures with fixed locations are the superblock -and the group descriptor table. The flex\_bg mechanism uses this +and the group descriptor table. The flex_bg mechanism uses this property to group several block groups into a flex group and lay out all of the groups' bitmaps and inode tables into one long run in the first group of the flex group. -If the meta\_bg feature flag is set, then several block groups are -grouped together into a meta group. Note that in the meta\_bg case, +If the meta_bg feature flag is set, then several block groups are +grouped together into a meta group. Note that in the meta_bg case, however, the first and last two block groups within the larger meta group contain only group descriptors for the groups inside the meta group. -flex\_bg and meta\_bg do not appear to be mutually exclusive features. +flex_bg and meta_bg do not appear to be mutually exclusive features. In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the block group descriptor was only 32 bytes long and therefore ends at -bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the +bg_checksum. On an ext4 filesystem with the 64bit feature enabled, the block group descriptor expands to at least the 64 bytes described below; the size is stored in the superblock. -If gdt\_csum is set and metadata\_csum is not set, the block group +If gdt_csum is set and metadata_csum is not set, the block group checksum is the crc16 of the FS UUID, the group number, and the group -descriptor structure. If metadata\_csum is set, then the block group +descriptor structure. If metadata_csum is set, then the block group checksum is the lower 16 bits of the checksum of the FS UUID, the group number, and the group descriptor structure. Both block and inode bitmap checksums are calculated against the FS UUID, the group number, and the @@ -51,59 +51,59 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. - Name - Description * - 0x0 - - \_\_le32 - - bg\_block\_bitmap\_lo + - __le32 + - bg_block_bitmap_lo - Lower 32-bits of location of block bitmap. * - 0x4 - - \_\_le32 - - bg\_inode\_bitmap\_lo + - __le32 + - bg_inode_bitmap_lo - Lower 32-bits of location of inode bitmap. * - 0x8 - - \_\_le32 - - bg\_inode\_table\_lo + - __le32 + - bg_inode_table_lo - Lower 32-bits of location of inode table. * - 0xC - - \_\_le16 - - bg\_free\_blocks\_count\_lo + - __le16 + - bg_free_blocks_count_lo - Lower 16-bits of free block count. * - 0xE - - \_\_le16 - - bg\_free\_inodes\_count\_lo + - __le16 + - bg_free_inodes_count_lo - Lower 16-bits of free inode count. * - 0x10 - - \_\_le16 - - bg\_used\_dirs\_count\_lo + - __le16 + - bg_used_dirs_count_lo - Lower 16-bits of directory count. * - 0x12 - - \_\_le16 - - bg\_flags + - __le16 + - bg_flags - Block group flags. See the bgflags_ table below. * - 0x14 - - \_\_le32 - - bg\_exclude\_bitmap\_lo + - __le32 + - bg_exclude_bitmap_lo - Lower 32-bits of location of snapshot exclusion bitmap. * - 0x18 - - \_\_le16 - - bg\_block\_bitmap\_csum\_lo + - __le16 + - bg_block_bitmap_csum_lo - Lower 16-bits of the block bitmap checksum. * - 0x1A - - \_\_le16 - - bg\_inode\_bitmap\_csum\_lo + - __le16 + - bg_inode_bitmap_csum_lo - Lower 16-bits of the inode bitmap checksum. * - 0x1C - - \_\_le16 - - bg\_itable\_unused\_lo + - __le16 + - bg_itable_unused_lo - Lower 16-bits of unused inode count. If set, we needn't scan past the - ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the + ``(sb.s_inodes_per_group - gdt.bg_itable_unused)`` th entry in the inode table for this group. * - 0x1E - - \_\_le16 - - bg\_checksum - - Group descriptor checksum; crc16(sb\_uuid+group\_num+bg\_desc) if the - RO\_COMPAT\_GDT\_CSUM feature is set, or - crc32c(sb\_uuid+group\_num+bg\_desc) & 0xFFFF if the - RO\_COMPAT\_METADATA\_CSUM feature is set. The bg\_checksum - field in bg\_desc is skipped when calculating crc16 checksum, + - __le16 + - bg_checksum + - Group descriptor checksum; crc16(sb_uuid+group_num+bg_desc) if the + RO_COMPAT_GDT_CSUM feature is set, or + crc32c(sb_uuid+group_num+bg_desc) & 0xFFFF if the + RO_COMPAT_METADATA_CSUM feature is set. The bg_checksum + field in bg_desc is skipped when calculating crc16 checksum, and set to zero if crc32c checksum is used. * - - @@ -111,48 +111,48 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. - These fields only exist if the 64bit feature is enabled and s_desc_size > 32. * - 0x20 - - \_\_le32 - - bg\_block\_bitmap\_hi + - __le32 + - bg_block_bitmap_hi - Upper 32-bits of location of block bitmap. * - 0x24 - - \_\_le32 - - bg\_inode\_bitmap\_hi + - __le32 + - bg_inode_bitmap_hi - Upper 32-bits of location of inodes bitmap. * - 0x28 - - \_\_le32 - - bg\_inode\_table\_hi + - __le32 + - bg_inode_table_hi - Upper 32-bits of location of inodes table. * - 0x2C - - \_\_le16 - - bg\_free\_blocks\_count\_hi + - __le16 + - bg_free_blocks_count_hi - Upper 16-bits of free block count. * - 0x2E - - \_\_le16 - - bg\_free\_inodes\_count\_hi + - __le16 + - bg_free_inodes_count_hi - Upper 16-bits of free inode count. * - 0x30 - - \_\_le16 - - bg\_used\_dirs\_count\_hi + - __le16 + - bg_used_dirs_count_hi - Upper 16-bits of directory count. * - 0x32 - - \_\_le16 - - bg\_itable\_unused\_hi + - __le16 + - bg_itable_unused_hi - Upper 16-bits of unused inode count. * - 0x34 - - \_\_le32 - - bg\_exclude\_bitmap\_hi + - __le32 + - bg_exclude_bitmap_hi - Upper 32-bits of location of snapshot exclusion bitmap. * - 0x38 - - \_\_le16 - - bg\_block\_bitmap\_csum\_hi + - __le16 + - bg_block_bitmap_csum_hi - Upper 16-bits of the block bitmap checksum. * - 0x3A - - \_\_le16 - - bg\_inode\_bitmap\_csum\_hi + - __le16 + - bg_inode_bitmap_csum_hi - Upper 16-bits of the inode bitmap checksum. * - 0x3C - - \_\_u32 - - bg\_reserved + - __u32 + - bg_reserved - Padding to 64 bytes. .. _bgflags: @@ -166,8 +166,8 @@ Block group flags can be any combination of the following: * - Value - Description * - 0x1 - - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). + - inode table and bitmap are not initialized (EXT4_BG_INODE_UNINIT). * - 0x2 - - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT). + - block bitmap is not initialized (EXT4_BG_BLOCK_UNINIT). * - 0x4 - - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED). + - inode table is zeroed (EXT4_BG_INODE_ZEROED). diff --git a/Documentation/filesystems/ext4/ifork.rst b/Documentation/filesystems/ext4/ifork.rst index b9816d5a896b..dc31f505e6c8 100644 --- a/Documentation/filesystems/ext4/ifork.rst +++ b/Documentation/filesystems/ext4/ifork.rst @@ -1,6 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0 -The Contents of inode.i\_block +The Contents of inode.i_block ------------------------------ Depending on the type of file an inode describes, the 60 bytes of @@ -47,7 +47,7 @@ In ext4, the file to logical block map has been replaced with an extent tree. Under the old scheme, allocating a contiguous run of 1,000 blocks requires an indirect block to map all 1,000 entries; with extents, the mapping is reduced to a single ``struct ext4_extent`` with -``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate +``ee_len = 1000``. If flex_bg is enabled, it is possible to allocate very large files with a single extent, at a considerable reduction in metadata block use, and some improvement in disk efficiency. The inode must have the extents flag (0x80000) flag set for this feature to be in @@ -76,28 +76,28 @@ which is 12 bytes long: - Name - Description * - 0x0 - - \_\_le16 - - eh\_magic + - __le16 + - eh_magic - Magic number, 0xF30A. * - 0x2 - - \_\_le16 - - eh\_entries + - __le16 + - eh_entries - Number of valid entries following the header. * - 0x4 - - \_\_le16 - - eh\_max + - __le16 + - eh_max - Maximum number of entries that could follow the header. * - 0x6 - - \_\_le16 - - eh\_depth + - __le16 + - eh_depth - Depth of this extent node in the extent tree. 0 = this extent node points to data blocks; otherwise, this extent node points to other extent nodes. The extent tree can be at most 5 levels deep: a logical block number can be at most ``2^32``, and the smallest ``n`` that satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. * - 0x8 - - \_\_le32 - - eh\_generation + - __le32 + - eh_generation - Generation of the tree. (Used by Lustre, but not standard ext4). Internal nodes of the extent tree, also known as index nodes, are @@ -112,22 +112,22 @@ recorded as ``struct ext4_extent_idx``, and are 12 bytes long: - Name - Description * - 0x0 - - \_\_le32 - - ei\_block + - __le32 + - ei_block - This index node covers file blocks from 'block' onward. * - 0x4 - - \_\_le32 - - ei\_leaf\_lo + - __le32 + - ei_leaf_lo - Lower 32-bits of the block number of the extent node that is the next level lower in the tree. The tree node pointed to can be either another internal node or a leaf node, described below. * - 0x8 - - \_\_le16 - - ei\_leaf\_hi + - __le16 + - ei_leaf_hi - Upper 16-bits of the previous field. * - 0xA - - \_\_u16 - - ei\_unused + - __u16 + - ei_unused - Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, @@ -142,24 +142,24 @@ and are also 12 bytes long: - Name - Description * - 0x0 - - \_\_le32 - - ee\_block + - __le32 + - ee_block - First file block number that this extent covers. * - 0x4 - - \_\_le16 - - ee\_len + - __le16 + - ee_len - Number of blocks covered by extent. If the value of this field is <= 32768, the extent is initialized. If the value of the field is > 32768, the extent is uninitialized and the actual extent length is ``ee_len`` - 32768. Therefore, the maximum length of a initialized extent is 32768 blocks, and the maximum length of an uninitialized extent is 32767. * - 0x6 - - \_\_le16 - - ee\_start\_hi + - __le16 + - ee_start_hi - Upper 16-bits of the block number to which this extent points. * - 0x8 - - \_\_le32 - - ee\_start\_lo + - __le32 + - ee_start_lo - Lower 32-bits of the block number to which this extent points. Prior to the introduction of metadata checksums, the extent header + @@ -182,8 +182,8 @@ including) the checksum itself. - Name - Description * - 0x0 - - \_\_le32 - - eb\_checksum + - __le32 + - eb_checksum - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) Inline Data diff --git a/Documentation/filesystems/ext4/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst index d1075178ce0b..a728af0d2fd0 100644 --- a/Documentation/filesystems/ext4/inlinedata.rst +++ b/Documentation/filesystems/ext4/inlinedata.rst @@ -11,12 +11,12 @@ file is smaller than 60 bytes, then the data are stored inline in attribute space, then it might be found as an extended attribute “system.data” within the inode body (“ibody EA”). This of course constrains the amount of extended attributes one can attach to an inode. -If the data size increases beyond i\_block + ibody EA, a regular block +If the data size increases beyond i_block + ibody EA, a regular block is allocated and the contents moved to that block. Pending a change to compact the extended attribute key used to store inline data, one ought to be able to store 160 bytes of data in a -256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to +256-byte inode (as of June 2015, when i_extra_isize is 28). Prior to that, the limit was 156 bytes due to inefficient use of inode space. The inline data feature requires the presence of an extended attribute @@ -25,12 +25,12 @@ for “system.data”, even if the attribute value is zero length. Inline Directories ~~~~~~~~~~~~~~~~~~ -The first four bytes of i\_block are the inode number of the parent +The first four bytes of i_block are the inode number of the parent directory. Following that is a 56-byte space for an array of directory entries; see ``struct ext4_dir_entry``. If there is a “system.data” attribute in the inode body, the EA value is an array of ``struct ext4_dir_entry`` as well. Note that for inline directories, the -i\_block and EA space are treated as separate dirent blocks; directory +i_block and EA space are treated as separate dirent blocks; directory entries cannot span the two. Inline directory entries are not checksummed, as the inode checksum diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index 6c5ce666e63f..cfc6c1659931 100644 --- a/Documentation/filesystems/ext4/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst @@ -38,138 +38,138 @@ The inode table entry is laid out in ``struct ext4_inode``. - Name - Description * - 0x0 - - \_\_le16 - - i\_mode + - __le16 + - i_mode - File mode. See the table i_mode_ below. * - 0x2 - - \_\_le16 - - i\_uid + - __le16 + - i_uid - Lower 16-bits of Owner UID. * - 0x4 - - \_\_le32 - - i\_size\_lo + - __le32 + - i_size_lo - Lower 32-bits of size in bytes. * - 0x8 - - \_\_le32 - - i\_atime - - Last access time, in seconds since the epoch. However, if the EA\_INODE + - __le32 + - i_atime + - Last access time, in seconds since the epoch. However, if the EA_INODE inode flag is set, this inode stores an extended attribute value and this field contains the checksum of the value. * - 0xC - - \_\_le32 - - i\_ctime + - __le32 + - i_ctime - Last inode change time, in seconds since the epoch. However, if the - EA\_INODE inode flag is set, this inode stores an extended attribute + EA_INODE inode flag is set, this inode stores an extended attribute value and this field contains the lower 32 bits of the attribute value's reference count. * - 0x10 - - \_\_le32 - - i\_mtime + - __le32 + - i_mtime - Last data modification time, in seconds since the epoch. However, if the - EA\_INODE inode flag is set, this inode stores an extended attribute + EA_INODE inode flag is set, this inode stores an extended attribute value and this field contains the number of the inode that owns the extended attribute. * - 0x14 - - \_\_le32 - - i\_dtime + - __le32 + - i_dtime - Deletion Time, in seconds since the epoch. * - 0x18 - - \_\_le16 - - i\_gid + - __le16 + - i_gid - Lower 16-bits of GID. * - 0x1A - - \_\_le16 - - i\_links\_count + - __le16 + - i_links_count - Hard link count. Normally, ext4 does not permit an inode to have more than 65,000 hard links. This applies to files as well as directories, which means that there cannot be more than 64,998 subdirectories in a directory (each subdirectory's '..' entry counts as a hard link, as does - the '.' entry in the directory itself). With the DIR\_NLINK feature + the '.' entry in the directory itself). With the DIR_NLINK feature enabled, ext4 supports more than 64,998 subdirectories by setting this field to 1 to indicate that the number of hard links is not known. * - 0x1C - - \_\_le32 - - i\_blocks\_lo - - Lower 32-bits of “block” count. If the huge\_file feature flag is not + - __le32 + - i_blocks_lo + - Lower 32-bits of “block” count. If the huge_file feature flag is not set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks - on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in + on disk. If huge_file is set and EXT4_HUGE_FILE_FL is NOT set in ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi - << 32)`` 512-byte blocks on disk. If huge\_file is set and - EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file + << 32)`` 512-byte blocks on disk. If huge_file is set and + EXT4_HUGE_FILE_FL IS set in ``inode.i_flags``, then this file consumes (``i_blocks_lo + i_blocks_hi`` << 32) filesystem blocks on disk. * - 0x20 - - \_\_le32 - - i\_flags + - __le32 + - i_flags - Inode flags. See the table i_flags_ below. * - 0x24 - 4 bytes - - i\_osd1 + - i_osd1 - See the table i_osd1_ for more details. * - 0x28 - 60 bytes - - i\_block[EXT4\_N\_BLOCKS=15] - - Block map or extent tree. See the section “The Contents of inode.i\_block”. + - i_block[EXT4_N_BLOCKS=15] + - Block map or extent tree. See the section “The Contents of inode.i_block”. * - 0x64 - - \_\_le32 - - i\_generation + - __le32 + - i_generation - File version (for NFS). * - 0x68 - - \_\_le32 - - i\_file\_acl\_lo + - __le32 + - i_file_acl_lo - Lower 32-bits of extended attribute block. ACLs are of course one of many possible extended attributes; I think the name of this field is a result of the first use of extended attributes being for ACLs. * - 0x6C - - \_\_le32 - - i\_size\_high / i\_dir\_acl + - __le32 + - i_size_high / i_dir_acl - Upper 32-bits of file/directory size. In ext2/3 this field was named - i\_dir\_acl, though it was usually set to zero and never used. + i_dir_acl, though it was usually set to zero and never used. * - 0x70 - - \_\_le32 - - i\_obso\_faddr + - __le32 + - i_obso_faddr - (Obsolete) fragment address. * - 0x74 - 12 bytes - - i\_osd2 + - i_osd2 - See the table i_osd2_ for more details. * - 0x80 - - \_\_le16 - - i\_extra\_isize + - __le16 + - i_extra_isize - Size of this inode - 128. Alternately, the size of the extended inode fields beyond the original ext2 inode, including this field. * - 0x82 - - \_\_le16 - - i\_checksum\_hi + - __le16 + - i_checksum_hi - Upper 16-bits of the inode checksum. * - 0x84 - - \_\_le32 - - i\_ctime\_extra + - __le32 + - i_ctime_extra - Extra change time bits. This provides sub-second precision. See Inode Timestamps section. * - 0x88 - - \_\_le32 - - i\_mtime\_extra + - __le32 + - i_mtime_extra - Extra modification time bits. This provides sub-second precision. * - 0x8C - - \_\_le32 - - i\_atime\_extra + - __le32 + - i_atime_extra - Extra access time bits. This provides sub-second precision. * - 0x90 - - \_\_le32 - - i\_crtime + - __le32 + - i_crtime - File creation time, in seconds since the epoch. * - 0x94 - - \_\_le32 - - i\_crtime\_extra + - __le32 + - i_crtime_extra - Extra file creation time bits. This provides sub-second precision. * - 0x98 - - \_\_le32 - - i\_version\_hi + - __le32 + - i_version_hi - Upper 32-bits for version number. * - 0x9C - - \_\_le32 - - i\_projid + - __le32 + - i_projid - Project ID. .. _i_mode: @@ -183,45 +183,45 @@ The ``i_mode`` value is a combination of the following flags: * - Value - Description * - 0x1 - - S\_IXOTH (Others may execute) + - S_IXOTH (Others may execute) * - 0x2 - - S\_IWOTH (Others may write) + - S_IWOTH (Others may write) * - 0x4 - - S\_IROTH (Others may read) + - S_IROTH (Others may read) * - 0x8 - - S\_IXGRP (Group members may execute) + - S_IXGRP (Group members may execute) * - 0x10 - - S\_IWGRP (Group members may write) + - S_IWGRP (Group members may write) * - 0x20 - - S\_IRGRP (Group members may read) + - S_IRGRP (Group members may read) * - 0x40 - - S\_IXUSR (Owner may execute) + - S_IXUSR (Owner may execute) * - 0x80 - - S\_IWUSR (Owner may write) + - S_IWUSR (Owner may write) * - 0x100 - - S\_IRUSR (Owner may read) + - S_IRUSR (Owner may read) * - 0x200 - - S\_ISVTX (Sticky bit) + - S_ISVTX (Sticky bit) * - 0x400 - - S\_ISGID (Set GID) + - S_ISGID (Set GID) * - 0x800 - - S\_ISUID (Set UID) + - S_ISUID (Set UID) * - - These are mutually-exclusive file types: * - 0x1000 - - S\_IFIFO (FIFO) + - S_IFIFO (FIFO) * - 0x2000 - - S\_IFCHR (Character device) + - S_IFCHR (Character device) * - 0x4000 - - S\_IFDIR (Directory) + - S_IFDIR (Directory) * - 0x6000 - - S\_IFBLK (Block device) + - S_IFBLK (Block device) * - 0x8000 - - S\_IFREG (Regular file) + - S_IFREG (Regular file) * - 0xA000 - - S\_IFLNK (Symbolic link) + - S_IFLNK (Symbolic link) * - 0xC000 - - S\_IFSOCK (Socket) + - S_IFSOCK (Socket) .. _i_flags: @@ -234,56 +234,56 @@ The ``i_flags`` field is a combination of these values: * - Value - Description * - 0x1 - - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented) + - This file requires secure deletion (EXT4_SECRM_FL). (not implemented) * - 0x2 - This file should be preserved, should undeletion be desired - (EXT4\_UNRM\_FL). (not implemented) + (EXT4_UNRM_FL). (not implemented) * - 0x4 - - File is compressed (EXT4\_COMPR\_FL). (not really implemented) + - File is compressed (EXT4_COMPR_FL). (not really implemented) * - 0x8 - - All writes to the file must be synchronous (EXT4\_SYNC\_FL). + - All writes to the file must be synchronous (EXT4_SYNC_FL). * - 0x10 - - File is immutable (EXT4\_IMMUTABLE\_FL). + - File is immutable (EXT4_IMMUTABLE_FL). * - 0x20 - - File can only be appended (EXT4\_APPEND\_FL). + - File can only be appended (EXT4_APPEND_FL). * - 0x40 - - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). + - The dump(1) utility should not dump this file (EXT4_NODUMP_FL). * - 0x80 - - Do not update access time (EXT4\_NOATIME\_FL). + - Do not update access time (EXT4_NOATIME_FL). * - 0x100 - - Dirty compressed file (EXT4\_DIRTY\_FL). (not used) + - Dirty compressed file (EXT4_DIRTY_FL). (not used) * - 0x200 - - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) + - File has one or more compressed clusters (EXT4_COMPRBLK_FL). (not used) * - 0x400 - - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) + - Do not compress file (EXT4_NOCOMPR_FL). (not used) * - 0x800 - - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was - EXT4\_ECOMPR\_FL (compression error), which was never used. + - Encrypted inode (EXT4_ENCRYPT_FL). This bit value previously was + EXT4_ECOMPR_FL (compression error), which was never used. * - 0x1000 - - Directory has hashed indexes (EXT4\_INDEX\_FL). + - Directory has hashed indexes (EXT4_INDEX_FL). * - 0x2000 - - AFS magic directory (EXT4\_IMAGIC\_FL). + - AFS magic directory (EXT4_IMAGIC_FL). * - 0x4000 - File data must always be written through the journal - (EXT4\_JOURNAL\_DATA\_FL). + (EXT4_JOURNAL_DATA_FL). * - 0x8000 - - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) + - File tail should not be merged (EXT4_NOTAIL_FL). (not used by ext4) * - 0x10000 - All directory entry data should be written synchronously (see - ``dirsync``) (EXT4\_DIRSYNC\_FL). + ``dirsync``) (EXT4_DIRSYNC_FL). * - 0x20000 - - Top of directory hierarchy (EXT4\_TOPDIR\_FL). + - Top of directory hierarchy (EXT4_TOPDIR_FL). * - 0x40000 - - This is a huge file (EXT4\_HUGE\_FILE\_FL). + - This is a huge file (EXT4_HUGE_FILE_FL). * - 0x80000 - - Inode uses extents (EXT4\_EXTENTS\_FL). + - Inode uses extents (EXT4_EXTENTS_FL). * - 0x100000 - - Verity protected file (EXT4\_VERITY\_FL). + - Verity protected file (EXT4_VERITY_FL). * - 0x200000 - Inode stores a large extended attribute value in its data blocks - (EXT4\_EA\_INODE\_FL). + (EXT4_EA_INODE_FL). * - 0x400000 - - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). + - This file has blocks allocated past EOF (EXT4_EOFBLOCKS_FL). (deprecated) * - 0x01000000 - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline) @@ -294,21 +294,21 @@ The ``i_flags`` field is a combination of these values: - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in mainline) * - 0x10000000 - - Inode has inline data (EXT4\_INLINE\_DATA\_FL). + - Inode has inline data (EXT4_INLINE_DATA_FL). * - 0x20000000 - - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). + - Create children with the same project ID (EXT4_PROJINHERIT_FL). * - 0x80000000 - - Reserved for ext4 library (EXT4\_RESERVED\_FL). + - Reserved for ext4 library (EXT4_RESERVED_FL). * - - Aggregate flags: * - 0x705BDFFF - User-visible flags. * - 0x604BC0FF - - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and - EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's - EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of + - User-modifiable flags. Note that while EXT4_JOURNAL_DATA_FL and + EXT4_EXTENTS_FL can be set with setattr, they are not in the kernel's + EXT4_FL_USER_MODIFIABLE mask, since it needs to handle the setting of these flags in a special manner and they are masked out of the set of - flags that are saved directly to i\_flags. + flags that are saved directly to i_flags. .. _i_osd1: @@ -325,9 +325,9 @@ Linux: - Name - Description * - 0x0 - - \_\_le32 - - l\_i\_version - - Inode version. However, if the EA\_INODE inode flag is set, this inode + - __le32 + - l_i_version + - Inode version. However, if the EA_INODE inode flag is set, this inode stores an extended attribute value and this field contains the upper 32 bits of the attribute value's reference count. @@ -342,8 +342,8 @@ Hurd: - Name - Description * - 0x0 - - \_\_le32 - - h\_i\_translator + - __le32 + - h_i_translator - ?? Masix: @@ -357,8 +357,8 @@ Masix: - Name - Description * - 0x0 - - \_\_le32 - - m\_i\_reserved + - __le32 + - m_i_reserved - ?? .. _i_osd2: @@ -376,30 +376,30 @@ Linux: - Name - Description * - 0x0 - - \_\_le16 - - l\_i\_blocks\_high + - __le16 + - l_i_blocks_high - Upper 16-bits of the block count. Please see the note attached to - i\_blocks\_lo. + i_blocks_lo. * - 0x2 - - \_\_le16 - - l\_i\_file\_acl\_high + - __le16 + - l_i_file_acl_high - Upper 16-bits of the extended attribute block (historically, the file ACL location). See the Extended Attributes section below. * - 0x4 - - \_\_le16 - - l\_i\_uid\_high + - __le16 + - l_i_uid_high - Upper 16-bits of the Owner UID. * - 0x6 - - \_\_le16 - - l\_i\_gid\_high + - __le16 + - l_i_gid_high - Upper 16-bits of the GID. * - 0x8 - - \_\_le16 - - l\_i\_checksum\_lo + - __le16 + - l_i_checksum_lo - Lower 16-bits of the inode checksum. * - 0xA - - \_\_le16 - - l\_i\_reserved + - __le16 + - l_i_reserved - Unused. Hurd: @@ -413,24 +413,24 @@ Hurd: - Name - Description * - 0x0 - - \_\_le16 - - h\_i\_reserved1 + - __le16 + - h_i_reserved1 - ?? * - 0x2 - - \_\_u16 - - h\_i\_mode\_high + - __u16 + - h_i_mode_high - Upper 16-bits of the file mode. * - 0x4 - - \_\_le16 - - h\_i\_uid\_high + - __le16 + - h_i_uid_high - Upper 16-bits of the Owner UID. * - 0x6 - - \_\_le16 - - h\_i\_gid\_high + - __le16 + - h_i_gid_high - Upper 16-bits of the GID. * - 0x8 - - \_\_u32 - - h\_i\_author + - __u32 + - h_i_author - Author code? Masix: @@ -444,17 +444,17 @@ Masix: - Name - Description * - 0x0 - - \_\_le16 - - h\_i\_reserved1 + - __le16 + - h_i_reserved1 - ?? * - 0x2 - - \_\_u16 - - m\_i\_file\_acl\_high + - __u16 + - m_i_file_acl_high - Upper 16-bits of the extended attribute block (historically, the file ACL location). * - 0x4 - - \_\_u32 - - m\_i\_reserved2[2] + - __u32 + - m_i_reserved2[2] - ?? Inode Size @@ -466,11 +466,11 @@ In ext2 and ext3, the inode structure size was fixed at 128 bytes on-disk inode at format time for all inodes in the filesystem to provide space beyond the end of the original ext2 inode. The on-disk inode record size is recorded in the superblock as ``s_inode_size``. The -number of bytes actually used by struct ext4\_inode beyond the original +number of bytes actually used by struct ext4_inode beyond the original 128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each -inode, which allows struct ext4\_inode to grow for a new kernel without +inode, which allows struct ext4_inode to grow for a new kernel without having to upgrade all of the on-disk inodes. Access to fields beyond -EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within +EXT2_GOOD_OLD_INODE_SIZE should be verified to be within ``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as of August 2019) the inode structure is 160 bytes (``i_extra_isize = 32``). The extra space between the end of the inode @@ -516,7 +516,7 @@ creation time (crtime); this field is 64-bits wide and decoded in the same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible through the regular stat() interface, though debugfs will report them. -We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). +We use the 32-bit signed time value plus (2^32 * (extra epoch bits)). In other words: .. list-table:: @@ -525,8 +525,8 @@ In other words: * - Extra epoch bits - MSB of 32-bit time - - Adjustment for signed 32-bit to 64-bit tv\_sec - - Decoded 64-bit tv\_sec + - Adjustment for signed 32-bit to 64-bit tv_sec + - Decoded 64-bit tv_sec - valid time range * - 0 0 - 1 diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst index 5fad38860f17..a6bef5293a60 100644 --- a/Documentation/filesystems/ext4/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -63,8 +63,8 @@ Generally speaking, the journal has this format: :header-rows: 1 * - Superblock - - descriptor\_block (data\_blocks or revocation\_block) [more data or - revocations] commmit\_block + - descriptor_block (data_blocks or revocation_block) [more data or + revocations] commmit_block - [more transactions...] * - - One transaction @@ -93,8 +93,8 @@ superblock. * - 1024 bytes of padding - ext4 Superblock - Journal Superblock - - descriptor\_block (data\_blocks or revocation\_block) [more data or - revocations] commmit\_block + - descriptor_block (data_blocks or revocation_block) [more data or + revocations] commmit_block - [more transactions...] * - - @@ -117,17 +117,17 @@ Every block in the journal starts with a common 12-byte header - Name - Description * - 0x0 - - \_\_be32 - - h\_magic + - __be32 + - h_magic - jbd2 magic number, 0xC03B3998. * - 0x4 - - \_\_be32 - - h\_blocktype + - __be32 + - h_blocktype - Description of what this block contains. See the jbd2_blocktype_ table below. * - 0x8 - - \_\_be32 - - h\_sequence + - __be32 + - h_sequence - The transaction ID that goes with this block. .. _jbd2_blocktype: @@ -177,99 +177,99 @@ which is 1024 bytes long: - - Static information describing the journal. * - 0x0 - - journal\_header\_t (12 bytes) - - s\_header + - journal_header_t (12 bytes) + - s_header - Common header identifying this as a superblock. * - 0xC - - \_\_be32 - - s\_blocksize + - __be32 + - s_blocksize - Journal device block size. * - 0x10 - - \_\_be32 - - s\_maxlen + - __be32 + - s_maxlen - Total number of blocks in this journal. * - 0x14 - - \_\_be32 - - s\_first + - __be32 + - s_first - First block of log information. * - - - - Dynamic information describing the current state of the log. * - 0x18 - - \_\_be32 - - s\_sequence + - __be32 + - s_sequence - First commit ID expected in log. * - 0x1C - - \_\_be32 - - s\_start + - __be32 + - s_start - Block number of the start of log. Contrary to the comments, this field being zero does not imply that the journal is clean! * - 0x20 - - \_\_be32 - - s\_errno - - Error value, as set by jbd2\_journal\_abort(). + - __be32 + - s_errno + - Error value, as set by jbd2_journal_abort(). * - - - - The remaining fields are only valid in a v2 superblock. * - 0x24 - - \_\_be32 - - s\_feature\_compat; + - __be32 + - s_feature_compat; - Compatible feature set. See the table jbd2_compat_ below. * - 0x28 - - \_\_be32 - - s\_feature\_incompat + - __be32 + - s_feature_incompat - Incompatible feature set. See the table jbd2_incompat_ below. * - 0x2C - - \_\_be32 - - s\_feature\_ro\_compat + - __be32 + - s_feature_ro_compat - Read-only compatible feature set. There aren't any of these currently. * - 0x30 - - \_\_u8 - - s\_uuid[16] + - __u8 + - s_uuid[16] - 128-bit uuid for journal. This is compared against the copy in the ext4 super block at mount time. * - 0x40 - - \_\_be32 - - s\_nr\_users + - __be32 + - s_nr_users - Number of file systems sharing this journal. * - 0x44 - - \_\_be32 - - s\_dynsuper + - __be32 + - s_dynsuper - Location of dynamic super block copy. (Not used?) * - 0x48 - - \_\_be32 - - s\_max\_transaction + - __be32 + - s_max_transaction - Limit of journal blocks per transaction. (Not used?) * - 0x4C - - \_\_be32 - - s\_max\_trans\_data + - __be32 + - s_max_trans_data - Limit of data blocks per transaction. (Not used?) * - 0x50 - - \_\_u8 - - s\_checksum\_type + - __u8 + - s_checksum_type - Checksum algorithm used for the journal. See jbd2_checksum_type_ for more info. * - 0x51 - - \_\_u8[3] - - s\_padding2 + - __u8[3] + - s_padding2 - * - 0x54 - - \_\_be32 - - s\_num\_fc\_blocks + - __be32 + - s_num_fc_blocks - Number of fast commit blocks in the journal. * - 0x58 - - \_\_u32 - - s\_padding[42] + - __u32 + - s_padding[42] - * - 0xFC - - \_\_be32 - - s\_checksum + - __be32 + - s_checksum - Checksum of the entire superblock, with this field set to zero. * - 0x100 - - \_\_u8 - - s\_users[16\*48] + - __u8 + - s_users[16*48] - ids of all file systems sharing the log. e2fsprogs/Linux don't allow shared external journals, but I imagine Lustre (or ocfs2?), which use the jbd2 code, might. @@ -286,7 +286,7 @@ The journal compat features are any combination of the following: - Description * - 0x1 - Journal maintains checksums on the data blocks. - (JBD2\_FEATURE\_COMPAT\_CHECKSUM) + (JBD2_FEATURE_COMPAT_CHECKSUM) .. _jbd2_incompat: @@ -299,23 +299,23 @@ The journal incompat features are any combination of the following: * - Value - Description * - 0x1 - - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) + - Journal has block revocation records. (JBD2_FEATURE_INCOMPAT_REVOKE) * - 0x2 - Journal can deal with 64-bit block numbers. - (JBD2\_FEATURE\_INCOMPAT\_64BIT) + (JBD2_FEATURE_INCOMPAT_64BIT) * - 0x4 - - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) + - Journal commits asynchronously. (JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) * - 0x8 - This journal uses v2 of the checksum on-disk format. Each journal metadata block gets its own checksum, and the block tags in the descriptor table contain checksums for each of the data blocks in the - journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) + journal. (JBD2_FEATURE_INCOMPAT_CSUM_V2) * - 0x10 - This journal uses v3 of the checksum on-disk format. This is the same as v2, but the journal block tag size is fixed regardless of the size of - block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) + block numbers. (JBD2_FEATURE_INCOMPAT_CSUM_V3) * - 0x20 - - Journal has fast commit blocks. (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) + - Journal has fast commit blocks. (JBD2_FEATURE_INCOMPAT_FAST_COMMIT) .. _jbd2_checksum_type: @@ -355,11 +355,11 @@ Descriptor blocks consume at least 36 bytes, but use a full block: - Name - Descriptor * - 0x0 - - journal\_header\_t + - journal_header_t - (open coded) - Common block header. * - 0xC - - struct journal\_block\_tag\_s + - struct journal_block_tag_s - open coded array[] - Enough tags either to fill up the block or to describe all the data blocks that follow this descriptor block. @@ -367,7 +367,7 @@ Descriptor blocks consume at least 36 bytes, but use a full block: Journal block tags have any of the following formats, depending on which journal feature and block tag flags are set. -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is +If JBD2_FEATURE_INCOMPAT_CSUM_V3 is set, the journal block tag is defined as ``struct journal_block_tag3_s``, which looks like the following. The size is 16 or 32 bytes. @@ -380,24 +380,24 @@ following. The size is 16 or 32 bytes. - Name - Descriptor * - 0x0 - - \_\_be32 - - t\_blocknr + - __be32 + - t_blocknr - Lower 32-bits of the location of where the corresponding data block should end up on disk. * - 0x4 - - \_\_be32 - - t\_flags + - __be32 + - t_flags - Flags that go with the descriptor. See the table jbd2_tag_flags_ for more info. * - 0x8 - - \_\_be32 - - t\_blocknr\_high + - __be32 + - t_blocknr_high - Upper 32-bits of the location of where the corresponding data block - should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is + should end up on disk. This is zero if JBD2_FEATURE_INCOMPAT_64BIT is not enabled. * - 0xC - - \_\_be32 - - t\_checksum + - __be32 + - t_checksum - Checksum of the journal UUID, the sequence number, and the data block. * - - @@ -433,7 +433,7 @@ The journal tag flags are any combination of the following: * - 0x8 - This is the last tag in this descriptor block. -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag +If JBD2_FEATURE_INCOMPAT_CSUM_V3 is NOT set, the journal block tag is defined as ``struct journal_block_tag_s``, which looks like the following. The size is 8, 12, 24, or 28 bytes: @@ -446,18 +446,18 @@ following. The size is 8, 12, 24, or 28 bytes: - Name - Descriptor * - 0x0 - - \_\_be32 - - t\_blocknr + - __be32 + - t_blocknr - Lower 32-bits of the location of where the corresponding data block should end up on disk. * - 0x4 - - \_\_be16 - - t\_checksum + - __be16 + - t_checksum - Checksum of the journal UUID, the sequence number, and the data block. Note that only the lower 16 bits are stored. * - 0x6 - - \_\_be16 - - t\_flags + - __be16 + - t_flags - Flags that go with the descriptor. See the table jbd2_tag_flags_ for more info. * - @@ -466,8 +466,8 @@ following. The size is 8, 12, 24, or 28 bytes: - This next field is only present if the super block indicates support for 64-bit block numbers. * - 0x8 - - \_\_be32 - - t\_blocknr\_high + - __be32 + - t_blocknr_high - Upper 32-bits of the location of where the corresponding data block should end up on disk. * - @@ -483,8 +483,8 @@ following. The size is 8, 12, 24, or 28 bytes: ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that field. -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or -JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a +If JBD2_FEATURE_INCOMPAT_CSUM_V2 or +JBD2_FEATURE_INCOMPAT_CSUM_V3 are set, the end of the block is a ``struct jbd2_journal_block_tail``, which looks like this: .. list-table:: @@ -496,8 +496,8 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a - Name - Descriptor * - 0x0 - - \_\_be32 - - t\_checksum + - __be32 + - t_checksum - Checksum of the journal UUID + the descriptor block, with this field set to zero. @@ -538,25 +538,25 @@ length, but use a full block: - Name - Description * - 0x0 - - journal\_header\_t - - r\_header + - journal_header_t + - r_header - Common block header. * - 0xC - - \_\_be32 - - r\_count + - __be32 + - r_count - Number of bytes used in this block. * - 0x10 - - \_\_be32 or \_\_be64 + - __be32 or __be64 - blocks[0] - Blocks to revoke. -After r\_count is a linear array of block numbers that are effectively +After r_count is a linear array of block numbers that are effectively revoked by this transaction. The size of each block number is 8 bytes if the superblock advertises 64-bit block number support, or 4 bytes otherwise. -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or -JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation +If JBD2_FEATURE_INCOMPAT_CSUM_V2 or +JBD2_FEATURE_INCOMPAT_CSUM_V3 are set, the end of the revocation block is a ``struct jbd2_journal_revoke_tail``, which has this format: .. list-table:: @@ -568,8 +568,8 @@ block is a ``struct jbd2_journal_revoke_tail``, which has this format: - Name - Description * - 0x0 - - \_\_be32 - - r\_checksum + - __be32 + - r_checksum - Checksum of the journal UUID + revocation block Commit Block @@ -592,38 +592,38 @@ bytes long (but uses a full block): - Name - Descriptor * - 0x0 - - journal\_header\_s + - journal_header_s - (open coded) - Common block header. * - 0xC - unsigned char - - h\_chksum\_type + - h_chksum_type - The type of checksum to use to verify the integrity of the data blocks in the transaction. See jbd2_checksum_type_ for more info. * - 0xD - unsigned char - - h\_chksum\_size + - h_chksum_size - The number of bytes used by the checksum. Most likely 4. * - 0xE - unsigned char - - h\_padding[2] + - h_padding[2] - * - 0x10 - - \_\_be32 - - h\_chksum[JBD2\_CHECKSUM\_BYTES] + - __be32 + - h_chksum[JBD2_CHECKSUM_BYTES] - 32 bytes of space to store checksums. If - JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 + JBD2_FEATURE_INCOMPAT_CSUM_V2 or JBD2_FEATURE_INCOMPAT_CSUM_V3 are set, the first ``__be32`` is the checksum of the journal UUID and the entire commit block, with this field zeroed. If - JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the + JBD2_FEATURE_COMPAT_CHECKSUM is set, the first ``__be32`` is the crc32 of all the blocks already written to the transaction. * - 0x30 - - \_\_be64 - - h\_commit\_sec + - __be64 + - h_commit_sec - The time that the transaction was committed, in seconds since the epoch. * - 0x38 - - \_\_be32 - - h\_commit\_nsec + - __be32 + - h_commit_nsec - Nanoseconds component of the above timestamp. Fast commits diff --git a/Documentation/filesystems/ext4/mmp.rst b/Documentation/filesystems/ext4/mmp.rst index 25660981d93c..174dd6538737 100644 --- a/Documentation/filesystems/ext4/mmp.rst +++ b/Documentation/filesystems/ext4/mmp.rst @@ -7,8 +7,8 @@ Multiple mount protection (MMP) is a feature that protects the filesystem against multiple hosts trying to use the filesystem simultaneously. When a filesystem is opened (for mounting, or fsck, etc.), the MMP code running on the node (call it node A) checks a -sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the -open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then +sequence number. If the sequence number is EXT4_MMP_SEQ_CLEAN, the +open continues. If the sequence number is EXT4_MMP_SEQ_FSCK, then fsck is (hopefully) running, and open fails immediately. Otherwise, the open code will wait for twice the specified MMP check interval and check the sequence number again. If the sequence number has changed, then the @@ -40,38 +40,38 @@ The MMP structure (``struct mmp_struct``) is as follows: - Name - Description * - 0x0 - - \_\_le32 - - mmp\_magic + - __le32 + - mmp_magic - Magic number for MMP, 0x004D4D50 (“MMP”). * - 0x4 - - \_\_le32 - - mmp\_seq + - __le32 + - mmp_seq - Sequence number, updated periodically. * - 0x8 - - \_\_le64 - - mmp\_time + - __le64 + - mmp_time - Time that the MMP block was last updated. * - 0x10 - char[64] - - mmp\_nodename + - mmp_nodename - Hostname of the node that opened the filesystem. * - 0x50 - char[32] - - mmp\_bdevname + - mmp_bdevname - Block device name of the filesystem. * - 0x70 - - \_\_le16 - - mmp\_check\_interval + - __le16 + - mmp_check_interval - The MMP re-check interval, in seconds. * - 0x72 - - \_\_le16 - - mmp\_pad1 + - __le16 + - mmp_pad1 - Zero. * - 0x74 - - \_\_le32[226] - - mmp\_pad2 + - __le32[226] + - mmp_pad2 - Zero. * - 0x3FC - - \_\_le32 - - mmp\_checksum + - __le32 + - mmp_checksum - Checksum of the MMP block. diff --git a/Documentation/filesystems/ext4/overview.rst b/Documentation/filesystems/ext4/overview.rst index 123ebfde47ee..0fad6eda6e15 100644 --- a/Documentation/filesystems/ext4/overview.rst +++ b/Documentation/filesystems/ext4/overview.rst @@ -7,7 +7,7 @@ An ext4 file system is split into a series of block groups. To reduce performance difficulties due to fragmentation, the block allocator tries very hard to keep each file's blocks within the same group, thereby reducing seek times. The size of a block group is specified in -``sb.s_blocks_per_group`` blocks, though it can also calculated as 8 \* +``sb.s_blocks_per_group`` blocks, though it can also calculated as 8 * ``block_size_in_bytes``. With the default block size of 4KiB, each group will contain 32,768 blocks, for a length of 128MiB. The number of block groups is the size of the device divided by the size of a block group. diff --git a/Documentation/filesystems/ext4/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst index 94f304e3a0a7..fc0636901fa0 100644 --- a/Documentation/filesystems/ext4/special_inodes.rst +++ b/Documentation/filesystems/ext4/special_inodes.rst @@ -34,7 +34,7 @@ ext4 reserves some inode for special features, as follows: * - 10 - Replica inode, used for some non-upstream feature? * - 11 - - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. + - Traditional first non-reserved inode. Usually this is the lost+found directory. See s_first_ino in the superblock. Note that there are also some inodes allocated from non-reserved inode numbers for other filesystem features which are not referenced from standard directory @@ -47,9 +47,9 @@ hierarchy. These are generally reference from the superblock. They are: * - Superblock field - Description - * - s\_lpf\_ino + * - s_lpf_ino - Inode number of lost+found directory. - * - s\_prj\_quota\_inum + * - s_prj_quota_inum - Inode number of quota file tracking project quotas - * - s\_orphan\_file\_inum + * - s_orphan_file_inum - Inode number of file tracking orphan inodes. diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index f6a548e957bb..268888522e35 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -7,7 +7,7 @@ The superblock records various information about the enclosing filesystem, such as block counts, inode counts, supported features, maintenance information, and more. -If the sparse\_super feature flag is set, redundant copies of the +If the sparse_super feature flag is set, redundant copies of the superblock and group descriptors are kept only in the groups whose group number is either 0 or a power of 3, 5, or 7. If the flag is not set, redundant copies are kept in all groups. @@ -27,107 +27,107 @@ The ext4 superblock is laid out as follows in - Name - Description * - 0x0 - - \_\_le32 - - s\_inodes\_count + - __le32 + - s_inodes_count - Total inode count. * - 0x4 - - \_\_le32 - - s\_blocks\_count\_lo + - __le32 + - s_blocks_count_lo - Total block count. * - 0x8 - - \_\_le32 - - s\_r\_blocks\_count\_lo + - __le32 + - s_r_blocks_count_lo - This number of blocks can only be allocated by the super-user. * - 0xC - - \_\_le32 - - s\_free\_blocks\_count\_lo + - __le32 + - s_free_blocks_count_lo - Free block count. * - 0x10 - - \_\_le32 - - s\_free\_inodes\_count + - __le32 + - s_free_inodes_count - Free inode count. * - 0x14 - - \_\_le32 - - s\_first\_data\_block + - __le32 + - s_first_data_block - First data block. This must be at least 1 for 1k-block filesystems and is typically 0 for all other block sizes. * - 0x18 - - \_\_le32 - - s\_log\_block\_size - - Block size is 2 ^ (10 + s\_log\_block\_size). + - __le32 + - s_log_block_size + - Block size is 2 ^ (10 + s_log_block_size). * - 0x1C - - \_\_le32 - - s\_log\_cluster\_size - - Cluster size is 2 ^ (10 + s\_log\_cluster\_size) blocks if bigalloc is - enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. + - __le32 + - s_log_cluster_size + - Cluster size is 2 ^ (10 + s_log_cluster_size) blocks if bigalloc is + enabled. Otherwise s_log_cluster_size must equal s_log_block_size. * - 0x20 - - \_\_le32 - - s\_blocks\_per\_group + - __le32 + - s_blocks_per_group - Blocks per group. * - 0x24 - - \_\_le32 - - s\_clusters\_per\_group + - __le32 + - s_clusters_per_group - Clusters per group, if bigalloc is enabled. Otherwise - s\_clusters\_per\_group must equal s\_blocks\_per\_group. + s_clusters_per_group must equal s_blocks_per_group. * - 0x28 - - \_\_le32 - - s\_inodes\_per\_group + - __le32 + - s_inodes_per_group - Inodes per group. * - 0x2C - - \_\_le32 - - s\_mtime + - __le32 + - s_mtime - Mount time, in seconds since the epoch. * - 0x30 - - \_\_le32 - - s\_wtime + - __le32 + - s_wtime - Write time, in seconds since the epoch. * - 0x34 - - \_\_le16 - - s\_mnt\_count + - __le16 + - s_mnt_count - Number of mounts since the last fsck. * - 0x36 - - \_\_le16 - - s\_max\_mnt\_count + - __le16 + - s_max_mnt_count - Number of mounts beyond which a fsck is needed. * - 0x38 - - \_\_le16 - - s\_magic + - __le16 + - s_magic - Magic signature, 0xEF53 * - 0x3A - - \_\_le16 - - s\_state + - __le16 + - s_state - File system state. See super_state_ for more info. * - 0x3C - - \_\_le16 - - s\_errors + - __le16 + - s_errors - Behaviour when detecting errors. See super_errors_ for more info. * - 0x3E - - \_\_le16 - - s\_minor\_rev\_level + - __le16 + - s_minor_rev_level - Minor revision level. * - 0x40 - - \_\_le32 - - s\_lastcheck + - __le32 + - s_lastcheck - Time of last check, in seconds since the epoch. * - 0x44 - - \_\_le32 - - s\_checkinterval + - __le32 + - s_checkinterval - Maximum time between checks, in seconds. * - 0x48 - - \_\_le32 - - s\_creator\_os + - __le32 + - s_creator_os - Creator OS. See the table super_creator_ for more info. * - 0x4C - - \_\_le32 - - s\_rev\_level + - __le32 + - s_rev_level - Revision level. See the table super_revision_ for more info. * - 0x50 - - \_\_le16 - - s\_def\_resuid + - __le16 + - s_def_resuid - Default uid for reserved blocks. * - 0x52 - - \_\_le16 - - s\_def\_resgid + - __le16 + - s_def_resgid - Default gid for reserved blocks. * - - @@ -143,50 +143,50 @@ The ext4 superblock is laid out as follows in about a feature in either the compatible or incompatible feature set, it must abort and not try to meddle with things it doesn't understand... * - 0x54 - - \_\_le32 - - s\_first\_ino + - __le32 + - s_first_ino - First non-reserved inode. * - 0x58 - - \_\_le16 - - s\_inode\_size + - __le16 + - s_inode_size - Size of inode structure, in bytes. * - 0x5A - - \_\_le16 - - s\_block\_group\_nr + - __le16 + - s_block_group_nr - Block group # of this superblock. * - 0x5C - - \_\_le32 - - s\_feature\_compat + - __le32 + - s_feature_compat - Compatible feature set flags. Kernel can still read/write this fs even if it doesn't understand a flag; fsck should not do that. See the super_compat_ table for more info. * - 0x60 - - \_\_le32 - - s\_feature\_incompat + - __le32 + - s_feature_incompat - Incompatible feature set. If the kernel or fsck doesn't understand one of these bits, it should stop. See the super_incompat_ table for more info. * - 0x64 - - \_\_le32 - - s\_feature\_ro\_compat + - __le32 + - s_feature_ro_compat - Readonly-compatible feature set. If the kernel doesn't understand one of these bits, it can still mount read-only. See the super_rocompat_ table for more info. * - 0x68 - - \_\_u8 - - s\_uuid[16] + - __u8 + - s_uuid[16] - 128-bit UUID for volume. * - 0x78 - char - - s\_volume\_name[16] + - s_volume_name[16] - Volume label. * - 0x88 - char - - s\_last\_mounted[64] + - s_last_mounted[64] - Directory where filesystem was last mounted. * - 0xC8 - - \_\_le32 - - s\_algorithm\_usage\_bitmap + - __le32 + - s_algorithm_usage_bitmap - For compression (Not used in e2fsprogs/Linux) * - - @@ -194,18 +194,18 @@ The ext4 superblock is laid out as follows in - Performance hints. Directory preallocation should only happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. * - 0xCC - - \_\_u8 - - s\_prealloc\_blocks + - __u8 + - s_prealloc_blocks - #. of blocks to try to preallocate for ... files? (Not used in e2fsprogs/Linux) * - 0xCD - - \_\_u8 - - s\_prealloc\_dir\_blocks + - __u8 + - s_prealloc_dir_blocks - #. of blocks to preallocate for directories. (Not used in e2fsprogs/Linux) * - 0xCE - - \_\_le16 - - s\_reserved\_gdt\_blocks + - __le16 + - s_reserved_gdt_blocks - Number of reserved GDT entries for future filesystem expansion. * - - @@ -213,281 +213,281 @@ The ext4 superblock is laid out as follows in - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is set. * - 0xD0 - - \_\_u8 - - s\_journal\_uuid[16] + - __u8 + - s_journal_uuid[16] - UUID of journal superblock * - 0xE0 - - \_\_le32 - - s\_journal\_inum + - __le32 + - s_journal_inum - inode number of journal file. * - 0xE4 - - \_\_le32 - - s\_journal\_dev + - __le32 + - s_journal_dev - Device number of journal file, if the external journal feature flag is set. * - 0xE8 - - \_\_le32 - - s\_last\_orphan + - __le32 + - s_last_orphan - Start of list of orphaned inodes to delete. * - 0xEC - - \_\_le32 - - s\_hash\_seed[4] + - __le32 + - s_hash_seed[4] - HTREE hash seed. * - 0xFC - - \_\_u8 - - s\_def\_hash\_version + - __u8 + - s_def_hash_version - Default hash algorithm to use for directory hashes. See super_def_hash_ for more info. * - 0xFD - - \_\_u8 - - s\_jnl\_backup\_type - - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the + - __u8 + - s_jnl_backup_type + - If this value is 0 or EXT3_JNL_BACKUP_BLOCKS (1), then the ``s_jnl_blocks`` field contains a duplicate copy of the inode's ``i_block[]`` array and ``i_size``. * - 0xFE - - \_\_le16 - - s\_desc\_size + - __le16 + - s_desc_size - Size of group descriptors, in bytes, if the 64bit incompat feature flag is set. * - 0x100 - - \_\_le32 - - s\_default\_mount\_opts + - __le32 + - s_default_mount_opts - Default mount options. See the super_mountopts_ table for more info. * - 0x104 - - \_\_le32 - - s\_first\_meta\_bg - - First metablock block group, if the meta\_bg feature is enabled. + - __le32 + - s_first_meta_bg + - First metablock block group, if the meta_bg feature is enabled. * - 0x108 - - \_\_le32 - - s\_mkfs\_time + - __le32 + - s_mkfs_time - When the filesystem was created, in seconds since the epoch. * - 0x10C - - \_\_le32 - - s\_jnl\_blocks[17] + - __le32 + - s_jnl_blocks[17] - Backup copy of the journal inode's ``i_block[]`` array in the first 15 - elements and i\_size\_high and i\_size in the 16th and 17th elements, + elements and i_size_high and i_size in the 16th and 17th elements, respectively. * - - - - 64bit support is valid only if EXT4_FEATURE_COMPAT_64BIT is set. * - 0x150 - - \_\_le32 - - s\_blocks\_count\_hi + - __le32 + - s_blocks_count_hi - High 32-bits of the block count. * - 0x154 - - \_\_le32 - - s\_r\_blocks\_count\_hi + - __le32 + - s_r_blocks_count_hi - High 32-bits of the reserved block count. * - 0x158 - - \_\_le32 - - s\_free\_blocks\_count\_hi + - __le32 + - s_free_blocks_count_hi - High 32-bits of the free block count. * - 0x15C - - \_\_le16 - - s\_min\_extra\_isize + - __le16 + - s_min_extra_isize - All inodes have at least # bytes. * - 0x15E - - \_\_le16 - - s\_want\_extra\_isize + - __le16 + - s_want_extra_isize - New inodes should reserve # bytes. * - 0x160 - - \_\_le32 - - s\_flags + - __le32 + - s_flags - Miscellaneous flags. See the super_flags_ table for more info. * - 0x164 - - \_\_le16 - - s\_raid\_stride + - __le16 + - s_raid_stride - RAID stride. This is the number of logical blocks read from or written to the disk before moving to the next disk. This affects the placement of filesystem metadata, which will hopefully make RAID storage faster. * - 0x166 - - \_\_le16 - - s\_mmp\_interval + - __le16 + - s_mmp_interval - #. seconds to wait in multi-mount prevention (MMP) checking. In theory, MMP is a mechanism to record in the superblock which host and device have mounted the filesystem, in order to prevent multiple mounts. This feature does not seem to be implemented... * - 0x168 - - \_\_le64 - - s\_mmp\_block + - __le64 + - s_mmp_block - Block # for multi-mount protection data. * - 0x170 - - \_\_le32 - - s\_raid\_stripe\_width + - __le32 + - s_raid_stripe_width - RAID stripe width. This is the number of logical blocks read from or written to the disk before coming back to the current disk. This is used by the block allocator to try to reduce the number of read-modify-write operations in a RAID5/6. * - 0x174 - - \_\_u8 - - s\_log\_groups\_per\_flex + - __u8 + - s_log_groups_per_flex - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. * - 0x175 - - \_\_u8 - - s\_checksum\_type + - __u8 + - s_checksum_type - Metadata checksum algorithm type. The only valid value is 1 (crc32c). * - 0x176 - - \_\_le16 - - s\_reserved\_pad + - __le16 + - s_reserved_pad - * - 0x178 - - \_\_le64 - - s\_kbytes\_written + - __le64 + - s_kbytes_written - Number of KiB written to this filesystem over its lifetime. * - 0x180 - - \_\_le32 - - s\_snapshot\_inum + - __le32 + - s_snapshot_inum - inode number of active snapshot. (Not used in e2fsprogs/Linux.) * - 0x184 - - \_\_le32 - - s\_snapshot\_id + - __le32 + - s_snapshot_id - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) * - 0x188 - - \_\_le64 - - s\_snapshot\_r\_blocks\_count + - __le64 + - s_snapshot_r_blocks_count - Number of blocks reserved for active snapshot's future use. (Not used in e2fsprogs/Linux.) * - 0x190 - - \_\_le32 - - s\_snapshot\_list + - __le32 + - s_snapshot_list - inode number of the head of the on-disk snapshot list. (Not used in e2fsprogs/Linux.) * - 0x194 - - \_\_le32 - - s\_error\_count + - __le32 + - s_error_count - Number of errors seen. * - 0x198 - - \_\_le32 - - s\_first\_error\_time + - __le32 + - s_first_error_time - First time an error happened, in seconds since the epoch. * - 0x19C - - \_\_le32 - - s\_first\_error\_ino + - __le32 + - s_first_error_ino - inode involved in first error. * - 0x1A0 - - \_\_le64 - - s\_first\_error\_block + - __le64 + - s_first_error_block - Number of block involved of first error. * - 0x1A8 - - \_\_u8 - - s\_first\_error\_func[32] + - __u8 + - s_first_error_func[32] - Name of function where the error happened. * - 0x1C8 - - \_\_le32 - - s\_first\_error\_line + - __le32 + - s_first_error_line - Line number where error happened. * - 0x1CC - - \_\_le32 - - s\_last\_error\_time + - __le32 + - s_last_error_time - Time of most recent error, in seconds since the epoch. * - 0x1D0 - - \_\_le32 - - s\_last\_error\_ino + - __le32 + - s_last_error_ino - inode involved in most recent error. * - 0x1D4 - - \_\_le32 - - s\_last\_error\_line + - __le32 + - s_last_error_line - Line number where most recent error happened. * - 0x1D8 - - \_\_le64 - - s\_last\_error\_block + - __le64 + - s_last_error_block - Number of block involved in most recent error. * - 0x1E0 - - \_\_u8 - - s\_last\_error\_func[32] + - __u8 + - s_last_error_func[32] - Name of function where the most recent error happened. * - 0x200 - - \_\_u8 - - s\_mount\_opts[64] + - __u8 + - s_mount_opts[64] - ASCIIZ string of mount options. * - 0x240 - - \_\_le32 - - s\_usr\_quota\_inum + - __le32 + - s_usr_quota_inum - Inode number of user `quota <quota>`__ file. * - 0x244 - - \_\_le32 - - s\_grp\_quota\_inum + - __le32 + - s_grp_quota_inum - Inode number of group `quota <quota>`__ file. * - 0x248 - - \_\_le32 - - s\_overhead\_blocks + - __le32 + - s_overhead_blocks - Overhead blocks/clusters in fs. (Huh? This field is always zero, which means that the kernel calculates it dynamically.) * - 0x24C - - \_\_le32 - - s\_backup\_bgs[2] - - Block groups containing superblock backups (if sparse\_super2) + - __le32 + - s_backup_bgs[2] + - Block groups containing superblock backups (if sparse_super2) * - 0x254 - - \_\_u8 - - s\_encrypt\_algos[4] + - __u8 + - s_encrypt_algos[4] - Encryption algorithms in use. There can be up to four algorithms in use at any time; valid algorithm codes are given in the super_encrypt_ table below. * - 0x258 - - \_\_u8 - - s\_encrypt\_pw\_salt[16] + - __u8 + - s_encrypt_pw_salt[16] - Salt for the string2key algorithm for encryption. * - 0x268 - - \_\_le32 - - s\_lpf\_ino + - __le32 + - s_lpf_ino - Inode number of lost+found * - 0x26C - - \_\_le32 - - s\_prj\_quota\_inum + - __le32 + - s_prj_quota_inum - Inode that tracks project quotas. * - 0x270 - - \_\_le32 - - s\_checksum\_seed - - Checksum seed used for metadata\_csum calculations. This value is - crc32c(~0, $orig\_fs\_uuid). + - __le32 + - s_checksum_seed + - Checksum seed used for metadata_csum calculations. This value is + crc32c(~0, $orig_fs_uuid). * - 0x274 - - \_\_u8 - - s\_wtime_hi + - __u8 + - s_wtime_hi - Upper 8 bits of the s_wtime field. * - 0x275 - - \_\_u8 - - s\_mtime_hi + - __u8 + - s_mtime_hi - Upper 8 bits of the s_mtime field. * - 0x276 - - \_\_u8 - - s\_mkfs_time_hi + - __u8 + - s_mkfs_time_hi - Upper 8 bits of the s_mkfs_time field. * - 0x277 - - \_\_u8 - - s\_lastcheck_hi + - __u8 + - s_lastcheck_hi - Upper 8 bits of the s_lastcheck_hi field. * - 0x278 - - \_\_u8 - - s\_first_error_time_hi + - __u8 + - s_first_error_time_hi - Upper 8 bits of the s_first_error_time_hi field. * - 0x279 - - \_\_u8 - - s\_last_error_time_hi + - __u8 + - s_last_error_time_hi - Upper 8 bits of the s_last_error_time_hi field. * - 0x27A - - \_\_u8 - - s\_pad[2] + - __u8 + - s_pad[2] - Zero padding. * - 0x27C - - \_\_le16 - - s\_encoding + - __le16 + - s_encoding - Filename charset encoding. * - 0x27E - - \_\_le16 - - s\_encoding_flags + - __le16 + - s_encoding_flags - Filename charset encoding flags. * - 0x280 - - \_\_le32 - - s\_orphan\_file\_inum + - __le32 + - s_orphan_file_inum - Orphan file inode number. * - 0x284 - - \_\_le32 - - s\_reserved[94] + - __le32 + - s_reserved[94] - Padding to the end of the block. * - 0x3FC - - \_\_le32 - - s\_checksum + - __le32 + - s_checksum - Superblock checksum. .. _super_state: @@ -574,44 +574,44 @@ following: * - Value - Description * - 0x1 - - Directory preallocation (COMPAT\_DIR\_PREALLOC). + - Directory preallocation (COMPAT_DIR_PREALLOC). * - 0x2 - “imagic inodes”. Not clear from the code what this does - (COMPAT\_IMAGIC\_INODES). + (COMPAT_IMAGIC_INODES). * - 0x4 - - Has a journal (COMPAT\_HAS\_JOURNAL). + - Has a journal (COMPAT_HAS_JOURNAL). * - 0x8 - - Supports extended attributes (COMPAT\_EXT\_ATTR). + - Supports extended attributes (COMPAT_EXT_ATTR). * - 0x10 - Has reserved GDT blocks for filesystem expansion - (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. + (COMPAT_RESIZE_INODE). Requires RO_COMPAT_SPARSE_SUPER. * - 0x20 - - Has directory indices (COMPAT\_DIR\_INDEX). + - Has directory indices (COMPAT_DIR_INDEX). * - 0x40 - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized - block groups? (COMPAT\_LAZY\_BG) + block groups? (COMPAT_LAZY_BG) * - 0x80 - - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). + - “Exclude inode”. Not used. (COMPAT_EXCLUDE_INODE). * - 0x100 - “Exclude bitmap”. Seems to be used to indicate the presence of snapshot-related exclude bitmaps? Not defined in kernel or used in - e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). + e2fsprogs (COMPAT_EXCLUDE_BITMAP). * - 0x200 - - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs + - Sparse Super Block, v2. If this flag is set, the SB field s_backup_bgs points to the two block groups that contain backup superblocks - (COMPAT\_SPARSE\_SUPER2). + (COMPAT_SPARSE_SUPER2). * - 0x400 - Fast commits supported. Although fast commits blocks are backward incompatible, fast commit blocks are not always present in the journal. If fast commit blocks are present in the journal, JBD2 incompat feature - (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) gets - set (COMPAT\_FAST\_COMMIT). + (JBD2_FEATURE_INCOMPAT_FAST_COMMIT) gets + set (COMPAT_FAST_COMMIT). * - 0x1000 - Orphan file allocated. This is the special file for more efficient tracking of unlinked but still open inodes. When there may be any entries in the file, we additionally set proper rocompat feature - (RO\_COMPAT\_ORPHAN\_PRESENT). + (RO_COMPAT_ORPHAN_PRESENT). .. _super_incompat: @@ -625,45 +625,45 @@ following: * - Value - Description * - 0x1 - - Compression (INCOMPAT\_COMPRESSION). + - Compression (INCOMPAT_COMPRESSION). * - 0x2 - - Directory entries record the file type. See ext4\_dir\_entry\_2 below - (INCOMPAT\_FILETYPE). + - Directory entries record the file type. See ext4_dir_entry_2 below + (INCOMPAT_FILETYPE). * - 0x4 - - Filesystem needs recovery (INCOMPAT\_RECOVER). + - Filesystem needs recovery (INCOMPAT_RECOVER). * - 0x8 - - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). + - Filesystem has a separate journal device (INCOMPAT_JOURNAL_DEV). * - 0x10 - Meta block groups. See the earlier discussion of this feature - (INCOMPAT\_META\_BG). + (INCOMPAT_META_BG). * - 0x40 - - Files in this filesystem use extents (INCOMPAT\_EXTENTS). + - Files in this filesystem use extents (INCOMPAT_EXTENTS). * - 0x80 - - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). + - Enable a filesystem size of 2^64 blocks (INCOMPAT_64BIT). * - 0x100 - - Multiple mount protection (INCOMPAT\_MMP). + - Multiple mount protection (INCOMPAT_MMP). * - 0x200 - Flexible block groups. See the earlier discussion of this feature - (INCOMPAT\_FLEX\_BG). + (INCOMPAT_FLEX_BG). * - 0x400 - Inodes can be used to store large extended attribute values - (INCOMPAT\_EA\_INODE). + (INCOMPAT_EA_INODE). * - 0x1000 - - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) + - Data in directory entry (INCOMPAT_DIRDATA). (Not implemented?) * - 0x2000 - Metadata checksum seed is stored in the superblock. This feature enables - the administrator to change the UUID of a metadata\_csum filesystem + the administrator to change the UUID of a metadata_csum filesystem while the filesystem is mounted; without it, the checksum definition - requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). + requires all metadata blocks to be rewritten (INCOMPAT_CSUM_SEED). * - 0x4000 - - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to + - Large directory >2GB or 3-level htree (INCOMPAT_LARGEDIR). Prior to this feature, directories could not be larger than 4GiB and could not have an htree more than 2 levels deep. If this feature is enabled, directories can be larger than 4GiB and have a maximum htree depth of 3. * - 0x8000 - - Data in inode (INCOMPAT\_INLINE\_DATA). + - Data in inode (INCOMPAT_INLINE_DATA). * - 0x10000 - - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). + - Encrypted inodes are present on the filesystem. (INCOMPAT_ENCRYPT). .. _super_rocompat: @@ -678,54 +678,54 @@ the following: - Description * - 0x1 - Sparse superblocks. See the earlier discussion of this feature - (RO\_COMPAT\_SPARSE\_SUPER). + (RO_COMPAT_SPARSE_SUPER). * - 0x2 - This filesystem has been used to store a file greater than 2GiB - (RO\_COMPAT\_LARGE\_FILE). + (RO_COMPAT_LARGE_FILE). * - 0x4 - - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). + - Not used in kernel or e2fsprogs (RO_COMPAT_BTREE_DIR). * - 0x8 - This filesystem has files whose sizes are represented in units of logical blocks, not 512-byte sectors. This implies a very large file - indeed! (RO\_COMPAT\_HUGE\_FILE) + indeed! (RO_COMPAT_HUGE_FILE) * - 0x10 - Group descriptors have checksums. In addition to detecting corruption, this is useful for lazy formatting with uninitialized groups - (RO\_COMPAT\_GDT\_CSUM). + (RO_COMPAT_GDT_CSUM). * - 0x20 - Indicates that the old ext3 32,000 subdirectory limit no longer applies - (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 + (RO_COMPAT_DIR_NLINK). A directory's i_links_count will be set to 1 if it is incremented past 64,999. * - 0x40 - Indicates that large inodes exist on this filesystem - (RO\_COMPAT\_EXTRA\_ISIZE). + (RO_COMPAT_EXTRA_ISIZE). * - 0x80 - - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). + - This filesystem has a snapshot (RO_COMPAT_HAS_SNAPSHOT). * - 0x100 - - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA). + - `Quota <Quota>`__ (RO_COMPAT_QUOTA). * - 0x200 - This filesystem supports “bigalloc”, which means that file extents are tracked in units of clusters (of blocks) instead of blocks - (RO\_COMPAT\_BIGALLOC). + (RO_COMPAT_BIGALLOC). * - 0x400 - This filesystem supports metadata checksumming. - (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though - GDT\_CSUM must not be set) + (RO_COMPAT_METADATA_CSUM; implies RO_COMPAT_GDT_CSUM, though + GDT_CSUM must not be set) * - 0x800 - Filesystem supports replicas. This feature is neither in the kernel nor - e2fsprogs. (RO\_COMPAT\_REPLICA) + e2fsprogs. (RO_COMPAT_REPLICA) * - 0x1000 - Read-only filesystem image; the kernel will not mount this image read-write and most tools will refuse to write to the image. - (RO\_COMPAT\_READONLY) + (RO_COMPAT_READONLY) * - 0x2000 - - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) + - Filesystem tracks project quotas. (RO_COMPAT_PROJECT) * - 0x8000 - - Verity inodes may be present on the filesystem. (RO\_COMPAT\_VERITY) + - Verity inodes may be present on the filesystem. (RO_COMPAT_VERITY) * - 0x10000 - Indicates orphan file may have valid orphan entries and thus we need to clean them up when mounting the filesystem - (RO\_COMPAT\_ORPHAN\_PRESENT). + (RO_COMPAT_ORPHAN_PRESENT). .. _super_def_hash: @@ -761,36 +761,36 @@ The ``s_default_mount_opts`` field is any combination of the following: * - Value - Description * - 0x0001 - - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) + - Print debugging info upon (re)mount. (EXT4_DEFM_DEBUG) * - 0x0002 - New files take the gid of the containing directory (instead of the fsgid - of the current process). (EXT4\_DEFM\_BSDGROUPS) + of the current process). (EXT4_DEFM_BSDGROUPS) * - 0x0004 - - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) + - Support userspace-provided extended attributes. (EXT4_DEFM_XATTR_USER) * - 0x0008 - - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) + - Support POSIX access control lists (ACLs). (EXT4_DEFM_ACL) * - 0x0010 - - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) + - Do not support 32-bit UIDs. (EXT4_DEFM_UID16) * - 0x0020 - All data and metadata are commited to the journal. - (EXT4\_DEFM\_JMODE\_DATA) + (EXT4_DEFM_JMODE_DATA) * - 0x0040 - All data are flushed to the disk before metadata are committed to the - journal. (EXT4\_DEFM\_JMODE\_ORDERED) + journal. (EXT4_DEFM_JMODE_ORDERED) * - 0x0060 - Data ordering is not preserved; data may be written after the metadata - has been written. (EXT4\_DEFM\_JMODE\_WBACK) + has been written. (EXT4_DEFM_JMODE_WBACK) * - 0x0100 - - Disable write flushes. (EXT4\_DEFM\_NOBARRIER) + - Disable write flushes. (EXT4_DEFM_NOBARRIER) * - 0x0200 - Track which blocks in a filesystem are metadata and therefore should not be used as data blocks. This option will be enabled by default on 3.18, - hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) + hopefully. (EXT4_DEFM_BLOCK_VALIDITY) * - 0x0400 - Enable DISCARD support, where the storage device is told about blocks - becoming unused. (EXT4\_DEFM\_DISCARD) + becoming unused. (EXT4_DEFM_DISCARD) * - 0x0800 - - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) + - Disable delayed allocation. (EXT4_DEFM_NODELALLOC) .. _super_flags: @@ -820,12 +820,12 @@ The ``s_encrypt_algos`` list can contain any of the following: * - Value - Description * - 0 - - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). + - Invalid algorithm (ENCRYPTION_MODE_INVALID). * - 1 - - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). + - 256-bit AES in XTS mode (ENCRYPTION_MODE_AES_256_XTS). * - 2 - - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). + - 256-bit AES in GCM mode (ENCRYPTION_MODE_AES_256_GCM). * - 3 - - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). + - 256-bit AES in CBC mode (ENCRYPTION_MODE_AES_256_CBC). Total size of the superblock is 1024 bytes. diff --git a/MAINTAINERS b/MAINTAINERS index f52543aedd61..6cc825857722 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3662,7 +3662,7 @@ BPF JIT for ARM M: Shubham Bansal <illusionist.neo@gmail.com> L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Odd Fixes F: arch/arm/net/ BPF JIT for ARM64 @@ -3686,14 +3686,15 @@ BPF JIT for NFP NICs M: Jakub Kicinski <kuba@kernel.org> L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Supported +S: Odd Fixes F: drivers/net/ethernet/netronome/nfp/bpf/ BPF JIT for POWERPC (32-BIT AND 64-BIT) M: Naveen N. Rao <naveen.n.rao@linux.ibm.com> +M: Michael Ellerman <mpe@ellerman.id.au> L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Supported F: arch/powerpc/net/ BPF JIT for RISC-V (32-bit) @@ -3719,7 +3720,7 @@ M: Heiko Carstens <hca@linux.ibm.com> M: Vasily Gorbik <gor@linux.ibm.com> L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Supported F: arch/s390/net/ X: arch/s390/net/pnet.c @@ -3727,14 +3728,14 @@ BPF JIT for SPARC (32-BIT AND 64-BIT) M: David S. Miller <davem@davemloft.net> L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Odd Fixes F: arch/sparc/net/ BPF JIT for X86 32-BIT M: Wang YanQing <udknight@gmail.com> L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Odd Fixes F: arch/x86/net/bpf_jit_comp32.c BPF JIT for X86 64-BIT @@ -3757,6 +3758,19 @@ F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c F: security/bpf/ +BPF L7 FRAMEWORK +M: John Fastabend <john.fastabend@gmail.com> +M: Jakub Sitnicki <jakub@cloudflare.com> +L: netdev@vger.kernel.org +L: bpf@vger.kernel.org +S: Maintained +F: include/linux/skmsg.h +F: net/core/skmsg.c +F: net/core/sock_map.c +F: net/ipv4/tcp_bpf.c +F: net/ipv4/udp_bpf.c +F: net/unix/unix_bpf.c + BPFTOOL M: Quentin Monnet <quentin@isovalent.com> L: bpf@vger.kernel.org @@ -9276,6 +9290,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git F: Documentation/devicetree/bindings/i2c/i2c.txt F: Documentation/i2c/ F: drivers/i2c/* +F: include/dt-bindings/i2c/i2c.h F: include/linux/i2c-dev.h F: include/linux/i2c-smbus.h F: include/linux/i2c.h @@ -9291,6 +9306,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git F: Documentation/devicetree/bindings/i2c/ F: drivers/i2c/algos/ F: drivers/i2c/busses/ +F: include/dt-bindings/i2c/ I2C-TAOS-EVM DRIVER M: Jean Delvare <jdelvare@suse.com> @@ -11096,20 +11112,6 @@ S: Maintained F: include/net/l3mdev.h F: net/l3mdev -L7 BPF FRAMEWORK -M: John Fastabend <john.fastabend@gmail.com> -M: Daniel Borkmann <daniel@iogearbox.net> -M: Jakub Sitnicki <jakub@cloudflare.com> -L: netdev@vger.kernel.org -L: bpf@vger.kernel.org -S: Maintained -F: include/linux/skmsg.h -F: net/core/skmsg.c -F: net/core/sock_map.c -F: net/ipv4/tcp_bpf.c -F: net/ipv4/udp_bpf.c -F: net/unix/unix_bpf.c - LANDLOCK SECURITY MODULE M: Mickaël Salaün <mic@digikod.net> L: linux-security-module@vger.kernel.org @@ -13952,7 +13954,6 @@ F: net/ipv6/tcp*.c NETWORKING [TLS] M: Boris Pismenny <borisp@nvidia.com> M: John Fastabend <john.fastabend@gmail.com> -M: Daniel Borkmann <daniel@iogearbox.net> M: Jakub Kicinski <kuba@kernel.org> L: netdev@vger.kernel.org S: Maintained @@ -14869,6 +14870,7 @@ F: include/dt-bindings/ OPENCOMPUTE PTP CLOCK DRIVER M: Jonathan Lemon <jonathan.lemon@gmail.com> +M: Vadim Fedorenko <vadfed@fb.com> L: netdev@vger.kernel.org S: Maintained F: drivers/ptp/ptp_ocp.c @@ -19305,7 +19307,7 @@ R: Andy Shevchenko <andriy.shevchenko@linux.intel.com> R: Mika Westerberg <mika.westerberg@linux.intel.com> R: Jan Dabros <jsd@semihalf.com> L: linux-i2c@vger.kernel.org -S: Maintained +S: Supported F: drivers/i2c/busses/i2c-designware-* SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Superb Owl # *DOCUMENTATION* diff --git a/arch/mips/boot/dts/ingenic/x1000.dtsi b/arch/mips/boot/dts/ingenic/x1000.dtsi index b0a034b468bb..42e69664efd9 100644 --- a/arch/mips/boot/dts/ingenic/x1000.dtsi +++ b/arch/mips/boot/dts/ingenic/x1000.dtsi @@ -111,8 +111,9 @@ clocks = <&cgu X1000_CLK_RTCLK>, <&cgu X1000_CLK_EXCLK>, - <&cgu X1000_CLK_PCLK>; - clock-names = "rtc", "ext", "pclk"; + <&cgu X1000_CLK_PCLK>, + <&cgu X1000_CLK_TCU>; + clock-names = "rtc", "ext", "pclk", "tcu"; interrupt-controller; #interrupt-cells = <1>; diff --git a/arch/mips/boot/dts/ingenic/x1830.dtsi b/arch/mips/boot/dts/ingenic/x1830.dtsi index dbf21afaccb1..65a5da71c199 100644 --- a/arch/mips/boot/dts/ingenic/x1830.dtsi +++ b/arch/mips/boot/dts/ingenic/x1830.dtsi @@ -104,8 +104,9 @@ clocks = <&cgu X1830_CLK_RTCLK>, <&cgu X1830_CLK_EXCLK>, - <&cgu X1830_CLK_PCLK>; - clock-names = "rtc", "ext", "pclk"; + <&cgu X1830_CLK_PCLK>, + <&cgu X1830_CLK_TCU>; + clock-names = "rtc", "ext", "pclk", "tcu"; interrupt-controller; #interrupt-cells = <1>; diff --git a/arch/mips/generic/board-ranchu.c b/arch/mips/generic/board-ranchu.c index a89aaad59cb1..930c45041882 100644 --- a/arch/mips/generic/board-ranchu.c +++ b/arch/mips/generic/board-ranchu.c @@ -44,6 +44,7 @@ static __init unsigned int ranchu_measure_hpt_freq(void) __func__); rtc_base = of_iomap(np, 0); + of_node_put(np); if (!rtc_base) panic("%s(): Failed to ioremap Goldfish RTC base!", __func__); diff --git a/arch/mips/lantiq/falcon/sysctrl.c b/arch/mips/lantiq/falcon/sysctrl.c index 5204fc6d6d50..1187729d8cbb 100644 --- a/arch/mips/lantiq/falcon/sysctrl.c +++ b/arch/mips/lantiq/falcon/sysctrl.c @@ -208,6 +208,12 @@ void __init ltq_soc_init(void) of_address_to_resource(np_sysgpe, 0, &res_sys[2])) panic("Failed to get core resources"); + of_node_put(np_status); + of_node_put(np_ebu); + of_node_put(np_sys1); + of_node_put(np_syseth); + of_node_put(np_sysgpe); + if ((request_mem_region(res_status.start, resource_size(&res_status), res_status.name) < 0) || (request_mem_region(res_ebu.start, resource_size(&res_ebu), diff --git a/arch/mips/lantiq/irq.c b/arch/mips/lantiq/irq.c index b732495f138a..20622bf0a9b3 100644 --- a/arch/mips/lantiq/irq.c +++ b/arch/mips/lantiq/irq.c @@ -408,6 +408,7 @@ int __init icu_of_init(struct device_node *node, struct device_node *parent) if (!ltq_eiu_membase) panic("Failed to remap eiu memory"); } + of_node_put(eiu_node); return 0; } diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index 084f6caba5f2..d444a1b98a72 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -441,6 +441,10 @@ void __init ltq_soc_init(void) of_address_to_resource(np_ebu, 0, &res_ebu)) panic("Failed to get core resources"); + of_node_put(np_pmu); + of_node_put(np_cgu); + of_node_put(np_ebu); + if (!request_mem_region(res_pmu.start, resource_size(&res_pmu), res_pmu.name) || !request_mem_region(res_cgu.start, resource_size(&res_cgu), diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index bbf1e38e1431..2cb708cdf01a 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -214,6 +214,8 @@ static void update_gic_frequency_dt(void) if (of_update_property(node, &gic_frequency_prop) < 0) pr_err("error updating gic frequency property\n"); + + of_node_put(node); } #endif diff --git a/arch/mips/pic32/pic32mzda/init.c b/arch/mips/pic32/pic32mzda/init.c index 129915616763..d9c8c4e46aff 100644 --- a/arch/mips/pic32/pic32mzda/init.c +++ b/arch/mips/pic32/pic32mzda/init.c @@ -98,13 +98,18 @@ static int __init pic32_of_prepare_platform_data(struct of_dev_auxdata *lookup) np = of_find_compatible_node(NULL, NULL, lookup->compatible); if (np) { lookup->name = (char *)np->name; - if (lookup->phys_addr) + if (lookup->phys_addr) { + of_node_put(np); continue; + } if (!of_address_to_resource(np, 0, &res)) lookup->phys_addr = res.start; + of_node_put(np); } } + of_node_put(root); + return 0; } diff --git a/arch/mips/pic32/pic32mzda/time.c b/arch/mips/pic32/pic32mzda/time.c index 7174e9abbb1b..777b515c52c8 100644 --- a/arch/mips/pic32/pic32mzda/time.c +++ b/arch/mips/pic32/pic32mzda/time.c @@ -32,6 +32,9 @@ static unsigned int pic32_xlate_core_timer_irq(void) goto default_map; irq = irq_of_parse_and_map(node, 0); + + of_node_put(node); + if (!irq) goto default_map; diff --git a/arch/mips/ralink/of.c b/arch/mips/ralink/of.c index 587c7b998769..ea8072acf8d9 100644 --- a/arch/mips/ralink/of.c +++ b/arch/mips/ralink/of.c @@ -40,6 +40,8 @@ __iomem void *plat_of_remap_node(const char *node) if (of_address_to_resource(np, 0, &res)) panic("Failed to get resource for %s", node); + of_node_put(np); + if (!request_mem_region(res.start, resource_size(&res), res.name)) diff --git a/arch/mips/vr41xx/common/icu.c b/arch/mips/vr41xx/common/icu.c index 7b7f25b4b057..9240bcdbe74e 100644 --- a/arch/mips/vr41xx/common/icu.c +++ b/arch/mips/vr41xx/common/icu.c @@ -640,8 +640,6 @@ static int icu_get_irq(unsigned int irq) printk(KERN_ERR "spurious ICU interrupt: %04x,%04x\n", pend1, pend2); - atomic_inc(&irq_err_count); - return -1; } diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 03deb4d6920d..928dcf7a20d9 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -124,6 +124,51 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +/* + * The TDX module spec states that #VE may be injected for a limited set of + * reasons: + * + * - Emulation of the architectural #VE injection on EPT violation; + * + * - As a result of guest TD execution of a disallowed instruction, + * a disallowed MSR access, or CPUID virtualization; + * + * - A notification to the guest TD about anomalous behavior; + * + * The last one is opt-in and is not used by the kernel. + * + * The Intel Software Developer's Manual describes cases when instruction + * length field can be used in section "Information for VM Exits Due to + * Instruction Execution". + * + * For TDX, it ultimately means GET_VEINFO provides reliable instruction length + * information if #VE occurred due to instruction execution, but not for EPT + * violations. + */ +static int ve_instr_len(struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_HLT: + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + case EXIT_REASON_CPUID: + case EXIT_REASON_IO_INSTRUCTION: + /* It is safe to use ve->instr_len for #VE due instructions */ + return ve->instr_len; + case EXIT_REASON_EPT_VIOLATION: + /* + * For EPT violations, ve->insn_len is not defined. For those, + * the kernel must decode instructions manually and should not + * be using this function. + */ + WARN_ONCE(1, "ve->instr_len is not defined for EPT violations"); + return 0; + default: + WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason); + return ve->instr_len; + } +} + static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) { struct tdx_hypercall_args args = { @@ -147,7 +192,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); } -static bool handle_halt(void) +static int handle_halt(struct ve_info *ve) { /* * Since non safe halt is mainly used in CPU offlining @@ -158,9 +203,9 @@ static bool handle_halt(void) const bool do_sti = false; if (__halt(irq_disabled, do_sti)) - return false; + return -EIO; - return true; + return ve_instr_len(ve); } void __cpuidle tdx_safe_halt(void) @@ -180,7 +225,7 @@ void __cpuidle tdx_safe_halt(void) WARN_ONCE(1, "HLT instruction emulation failed\n"); } -static bool read_msr(struct pt_regs *regs) +static int read_msr(struct pt_regs *regs, struct ve_info *ve) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -194,14 +239,14 @@ static bool read_msr(struct pt_regs *regs) * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>". */ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) - return false; + return -EIO; regs->ax = lower_32_bits(args.r11); regs->dx = upper_32_bits(args.r11); - return true; + return ve_instr_len(ve); } -static bool write_msr(struct pt_regs *regs) +static int write_msr(struct pt_regs *regs, struct ve_info *ve) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -215,10 +260,13 @@ static bool write_msr(struct pt_regs *regs) * can be found in TDX Guest-Host-Communication Interface * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>". */ - return !__tdx_hypercall(&args, 0); + if (__tdx_hypercall(&args, 0)) + return -EIO; + + return ve_instr_len(ve); } -static bool handle_cpuid(struct pt_regs *regs) +static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -236,7 +284,7 @@ static bool handle_cpuid(struct pt_regs *regs) */ if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { regs->ax = regs->bx = regs->cx = regs->dx = 0; - return true; + return ve_instr_len(ve); } /* @@ -245,7 +293,7 @@ static bool handle_cpuid(struct pt_regs *regs) * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>". */ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) - return false; + return -EIO; /* * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of @@ -257,7 +305,7 @@ static bool handle_cpuid(struct pt_regs *regs) regs->cx = args.r14; regs->dx = args.r15; - return true; + return ve_instr_len(ve); } static bool mmio_read(int size, unsigned long addr, unsigned long *val) @@ -283,10 +331,10 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val) EPT_WRITE, addr, val); } -static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) +static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) { + unsigned long *reg, val, vaddr; char buffer[MAX_INSN_SIZE]; - unsigned long *reg, val; struct insn insn = {}; enum mmio_type mmio; int size, extend_size; @@ -294,34 +342,49 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) /* Only in-kernel MMIO is supported */ if (WARN_ON_ONCE(user_mode(regs))) - return false; + return -EFAULT; if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) - return false; + return -EFAULT; if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) - return false; + return -EINVAL; mmio = insn_decode_mmio(&insn, &size); if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) - return false; + return -EINVAL; if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) - return false; + return -EINVAL; } - ve->instr_len = insn.length; + /* + * Reject EPT violation #VEs that split pages. + * + * MMIO accesses are supposed to be naturally aligned and therefore + * never cross page boundaries. Seeing split page accesses indicates + * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. + * + * load_unaligned_zeropad() will recover using exception fixups. + */ + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); + if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) + return -EFAULT; /* Handle writes first */ switch (mmio) { case MMIO_WRITE: memcpy(&val, reg, size); - return mmio_write(size, ve->gpa, val); + if (!mmio_write(size, ve->gpa, val)) + return -EIO; + return insn.length; case MMIO_WRITE_IMM: val = insn.immediate.value; - return mmio_write(size, ve->gpa, val); + if (!mmio_write(size, ve->gpa, val)) + return -EIO; + return insn.length; case MMIO_READ: case MMIO_READ_ZERO_EXTEND: case MMIO_READ_SIGN_EXTEND: @@ -334,15 +397,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) * decoded or handled properly. It was likely not using io.h * helpers or accessed MMIO accidentally. */ - return false; + return -EINVAL; default: WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); - return false; + return -EINVAL; } /* Handle reads */ if (!mmio_read(size, ve->gpa, &val)) - return false; + return -EIO; switch (mmio) { case MMIO_READ: @@ -364,13 +427,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) default: /* All other cases has to be covered with the first switch() */ WARN_ON_ONCE(1); - return false; + return -EINVAL; } if (extend_size) memset(reg, extend_val, extend_size); memcpy(reg, &val, size); - return true; + return insn.length; } static bool handle_in(struct pt_regs *regs, int size, int port) @@ -421,13 +484,14 @@ static bool handle_out(struct pt_regs *regs, int size, int port) * * Return True on success or False on failure. */ -static bool handle_io(struct pt_regs *regs, u32 exit_qual) +static int handle_io(struct pt_regs *regs, struct ve_info *ve) { + u32 exit_qual = ve->exit_qual; int size, port; - bool in; + bool in, ret; if (VE_IS_IO_STRING(exit_qual)) - return false; + return -EIO; in = VE_IS_IO_IN(exit_qual); size = VE_GET_IO_SIZE(exit_qual); @@ -435,9 +499,13 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) if (in) - return handle_in(regs, size, port); + ret = handle_in(regs, size, port); else - return handle_out(regs, size, port); + ret = handle_out(regs, size, port); + if (!ret) + return -EIO; + + return ve_instr_len(ve); } /* @@ -447,13 +515,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) __init bool tdx_early_handle_ve(struct pt_regs *regs) { struct ve_info ve; + int insn_len; tdx_get_ve_info(&ve); if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) return false; - return handle_io(regs, ve.exit_qual); + insn_len = handle_io(regs, &ve); + if (insn_len < 0) + return false; + + regs->ip += insn_len; + return true; } void tdx_get_ve_info(struct ve_info *ve) @@ -486,54 +560,65 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } -/* Handle the user initiated #VE */ -static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve) +/* + * Handle the user initiated #VE. + * + * On success, returns the number of bytes RIP should be incremented (>=0) + * or -errno on error. + */ +static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve) { switch (ve->exit_reason) { case EXIT_REASON_CPUID: - return handle_cpuid(regs); + return handle_cpuid(regs, ve); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); - return false; + return -EIO; } } -/* Handle the kernel #VE */ -static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) +/* + * Handle the kernel #VE. + * + * On success, returns the number of bytes RIP should be incremented (>=0) + * or -errno on error. + */ +static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) { switch (ve->exit_reason) { case EXIT_REASON_HLT: - return handle_halt(); + return handle_halt(ve); case EXIT_REASON_MSR_READ: - return read_msr(regs); + return read_msr(regs, ve); case EXIT_REASON_MSR_WRITE: - return write_msr(regs); + return write_msr(regs, ve); case EXIT_REASON_CPUID: - return handle_cpuid(regs); + return handle_cpuid(regs, ve); case EXIT_REASON_EPT_VIOLATION: return handle_mmio(regs, ve); case EXIT_REASON_IO_INSTRUCTION: - return handle_io(regs, ve->exit_qual); + return handle_io(regs, ve); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); - return false; + return -EIO; } } bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) { - bool ret; + int insn_len; if (user_mode(regs)) - ret = virt_exception_user(regs, ve); + insn_len = virt_exception_user(regs, ve); else - ret = virt_exception_kernel(regs, ve); + insn_len = virt_exception_kernel(regs, ve); + if (insn_len < 0) + return false; /* After successful #VE handling, move the IP */ - if (ret) - regs->ip += ve->instr_len; + regs->ip += insn_len; - return ret; + return true; } static bool tdx_tlb_flush_required(bool private) diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 5a39ed59b6db..e8f58ddd06d9 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -4,9 +4,6 @@ #include <asm/e820/types.h> -struct device; -struct resource; - extern struct e820_table *e820_table; extern struct e820_table *e820_table_kexec; extern struct e820_table *e820_table_firmware; @@ -46,8 +43,6 @@ extern void e820__register_nosave_regions(unsigned long limit_pfn); extern int e820__get_entry_type(u64 start, u64 end); -extern void remove_e820_regions(struct device *dev, struct resource *avail); - /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 71943dce691e..9636742a80f2 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -323,7 +323,7 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_get_memory_space_descriptor(phys, desc) \ (__efi64_split(phys), (desc)) -#define __efi64_argmap_set_memory_space_descriptor(phys, size, flags) \ +#define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) /* diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index f52a886d35cf..70533fdcbf02 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -69,6 +69,8 @@ void pcibios_scan_specific_bus(int busn); /* pci-irq.c */ +struct pci_dev; + struct irq_info { u8 bus, devfn; /* Bus, device and function */ struct { @@ -246,3 +248,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) # define x86_default_pci_init_irq NULL # define x86_default_pci_fixup_irqs NULL #endif + +#if defined(CONFIG_PCI) && defined(CONFIG_ACPI) +extern bool pci_use_e820; +#else +#define pci_use_e820 false +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7590ac2570b9..f8b9ee97a891 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -108,19 +108,16 @@ extern unsigned long _brk_end; void *extend_brk(size_t size, size_t align); /* - * Reserve space in the brk section. The name must be unique within the file, - * and somewhat descriptive. The size is in bytes. + * Reserve space in the .brk section, which is a block of memory from which the + * caller is allowed to allocate very early (before even memblock is available) + * by calling extend_brk(). All allocated memory will be eventually converted + * to memblock. Any leftover unallocated memory will be freed. * - * The allocation is done using inline asm (rather than using a section - * attribute on a normal variable) in order to allow the use of @nobits, so - * that it doesn't take up any space in the vmlinux file. + * The size is in bytes. */ -#define RESERVE_BRK(name, size) \ - asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \ - ".brk." #name ":\n\t" \ - ".skip " __stringify(size) "\n\t" \ - ".size .brk." #name ", " __stringify(size) "\n\t" \ - ".popsection\n\t") +#define RESERVE_BRK(name, size) \ + __section(".bss..brk") __aligned(1) __used \ + static char __brk_##name[size] extern void probe_roms(void); #ifdef __i386__ @@ -133,12 +130,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ -#else -#define RESERVE_BRK(name,sz) \ - .pushsection .brk_reservation,"aw",@nobits; \ -.brk.name: \ -1: .skip sz; \ - .size .brk.name,.-1b; \ + +#else /* __ASSEMBLY */ + +.macro __RESERVE_BRK name, size + .pushsection .bss..brk, "aw" +SYM_DATA_START(__brk_\name) + .skip \size +SYM_DATA_END(__brk_\name) .popsection +.endm + +#define RESERVE_BRK(name, size) __RESERVE_BRK name, size + #endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_SETUP_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 03364dc40d8d..4c8b6ae802ac 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,10 +36,6 @@ KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD_test_nx.o := y -ifdef CONFIG_FRAME_POINTER -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y -endif - # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 4ec13608d3c6..dfeb227de561 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -175,6 +175,7 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue SYM_FUNC_END(ftrace_caller); +STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_epilogue) /* @@ -282,6 +283,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) +STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -311,10 +313,14 @@ trace: jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) +STACK_FRAME_NON_STANDARD_FP(__fentry__) + #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(return_to_handler) +SYM_CODE_START(return_to_handler) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR subq $16, %rsp /* Save the return values */ @@ -339,7 +345,6 @@ SYM_FUNC_START(return_to_handler) int3 .Ldo_rop: mov %rdi, (%rsp) - UNWIND_HINT_FUNC RET -SYM_FUNC_END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index db2b350a37b7..bba1abd05bfe 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/dev_printk.h> #include <linux/ioport.h> +#include <linux/printk.h> #include <asm/e820/api.h> +#include <asm/pci_x86.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -24,14 +25,14 @@ static void resource_clip(struct resource *res, resource_size_t start, res->start = end + 1; } -void remove_e820_regions(struct device *dev, struct resource *avail) +static void remove_e820_regions(struct resource *avail) { int i; struct e820_entry *entry; u64 e820_start, e820_end; struct resource orig = *avail; - if (!(avail->flags & IORESOURCE_MEM)) + if (!pci_use_e820) return; for (i = 0; i < e820_table->nr_entries; i++) { @@ -41,7 +42,7 @@ void remove_e820_regions(struct device *dev, struct resource *avail) resource_clip(avail, e820_start, e820_end); if (orig.start != avail->start || orig.end != avail->end) { - dev_info(dev, "clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", + pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", &orig, avail, e820_start, e820_end); orig = *avail; } @@ -55,6 +56,9 @@ void arch_remove_reservations(struct resource *avail) * the low 1MB unconditionally, as this area is needed for some ISA * cards requiring a memory range, e.g. the i82365 PCMCIA controller. */ - if (avail->flags & IORESOURCE_MEM) + if (avail->flags & IORESOURCE_MEM) { resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3ebb85327edb..bd6c6fd373ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -67,11 +67,6 @@ RESERVE_BRK(dmi_alloc, 65536); #endif -/* - * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK() facility reserving additional - * chunks. - */ unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f5f6dc2e8007..81aba718ecd5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -385,10 +385,10 @@ SECTIONS __end_of_kernel_reserve = .; . = ALIGN(PAGE_SIZE); - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + .brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ + *(.bss..brk) /* areas brk users have reserved */ __brk_limit = .; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f298b18a9a3d..c98b8c0ed3b8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1420,8 +1420,9 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { + /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ EMIT3_off32(0x48, 0x8B, 0x85, - -(bpf_prog->aux->stack_depth + 8)); + -round_up(bpf_prog->aux->stack_depth, 8) - 8); if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) return -EINVAL; } else { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a4f43054bc79..2f82480fd430 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -8,7 +8,6 @@ #include <linux/pci-acpi.h> #include <asm/numa.h> #include <asm/pci_x86.h> -#include <asm/e820/api.h> struct pci_root_info { struct acpi_pci_root_info common; @@ -20,7 +19,7 @@ struct pci_root_info { #endif }; -static bool pci_use_e820 = true; +bool pci_use_e820 = true; static bool pci_use_crs = true; static bool pci_ignore_seg; @@ -387,11 +386,6 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) status = acpi_pci_probe_root_resources(ci); - if (pci_use_e820) { - resource_list_for_each_entry(entry, &ci->resources) - remove_e820_regions(&device->dev, entry->res); - } - if (pci_use_crs) { resource_list_for_each_entry_safe(entry, tmp, &ci->resources) if (resource_is_pcicfg_ioport(entry->res)) diff --git a/certs/Makefile b/certs/Makefile index a8d628fd5f7b..88a73b28d254 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -3,8 +3,8 @@ # Makefile for the linux kernel signature checking certificates. # -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o common.o -obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o common.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o +obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o obj-$(CONFIG_SYSTEM_REVOCATION_LIST) += revocation_certificates.o ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),) diff --git a/certs/blacklist.c b/certs/blacklist.c index 25094ea73600..41f10601cc72 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -15,10 +15,9 @@ #include <linux/err.h> #include <linux/seq_file.h> #include <linux/uidgid.h> -#include <linux/verification.h> +#include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include "blacklist.h" -#include "common.h" /* * According to crypto/asymmetric_keys/x509_cert_parser.c:x509_note_pkey_algo(), @@ -365,8 +364,9 @@ static __init int load_revocation_certificate_list(void) if (revocation_certificate_list_size) pr_notice("Loading compiled-in revocation X.509 certificates\n"); - return load_certificate_list(revocation_certificate_list, revocation_certificate_list_size, - blacklist_keyring); + return x509_load_certificate_list(revocation_certificate_list, + revocation_certificate_list_size, + blacklist_keyring); } late_initcall(load_revocation_certificate_list); #endif diff --git a/certs/common.h b/certs/common.h deleted file mode 100644 index abdb5795936b..000000000000 --- a/certs/common.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -#ifndef _CERT_COMMON_H -#define _CERT_COMMON_H - -int load_certificate_list(const u8 cert_list[], const unsigned long list_size, - const struct key *keyring); - -#endif diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 05b66ce9d1c9..5042cc54fa5e 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -16,7 +16,6 @@ #include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include <crypto/pkcs7.h> -#include "common.h" static struct key *builtin_trusted_keys; #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING @@ -183,7 +182,8 @@ __init int load_module_cert(struct key *keyring) pr_notice("Loading compiled-in module X.509 certificates\n"); - return load_certificate_list(system_certificate_list, module_cert_size, keyring); + return x509_load_certificate_list(system_certificate_list, + module_cert_size, keyring); } /* @@ -204,7 +204,7 @@ static __init int load_system_certificate_list(void) size = system_certificate_list_size - module_cert_size; #endif - return load_certificate_list(p, size, builtin_trusted_keys); + return x509_load_certificate_list(p, size, builtin_trusted_keys); } late_initcall(load_system_certificate_list); diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 460bc5d0a828..3df3fe4ed95f 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -75,4 +75,14 @@ config SIGNED_PE_FILE_VERIFICATION This option provides support for verifying the signature(s) on a signed PE binary. +config FIPS_SIGNATURE_SELFTEST + bool "Run FIPS selftests on the X.509+PKCS7 signature verification" + help + This option causes some selftests to be run on the signature + verification code, using some built in data. This is required + for FIPS. + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + depends on PKCS7_MESSAGE_PARSER + endif # ASYMMETRIC_KEY_TYPE diff --git a/crypto/asymmetric_keys/Makefile b/crypto/asymmetric_keys/Makefile index c38424f55b08..0d1fa1b692c6 100644 --- a/crypto/asymmetric_keys/Makefile +++ b/crypto/asymmetric_keys/Makefile @@ -20,7 +20,9 @@ x509_key_parser-y := \ x509.asn1.o \ x509_akid.asn1.o \ x509_cert_parser.o \ + x509_loader.o \ x509_public_key.o +x509_key_parser-$(CONFIG_FIPS_SIGNATURE_SELFTEST) += selftest.o $(obj)/x509_cert_parser.o: \ $(obj)/x509.asn1.h \ diff --git a/crypto/asymmetric_keys/selftest.c b/crypto/asymmetric_keys/selftest.c new file mode 100644 index 000000000000..fa0bf7f24284 --- /dev/null +++ b/crypto/asymmetric_keys/selftest.c @@ -0,0 +1,224 @@ +/* Self-testing for signature checking. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/cred.h> +#include <linux/key.h> +#include <crypto/pkcs7.h> +#include "x509_parser.h" + +struct certs_test { + const u8 *data; + size_t data_len; + const u8 *pkcs7; + size_t pkcs7_len; +}; + +/* + * Set of X.509 certificates to provide public keys for the tests. These will + * be loaded into a temporary keyring for the duration of the testing. + */ +static const __initconst u8 certs_selftest_keys[] = { + "\x30\x82\x05\x55\x30\x82\x03\x3d\xa0\x03\x02\x01\x02\x02\x14\x73" + "\x98\xea\x98\x2d\xd0\x2e\xa8\xb1\xcf\x57\xc7\xf2\x97\xb3\xe6\x1a" + "\xfc\x8c\x0a\x30\x0d\x06\x09\x2a\x86\x48\x86\xf7\x0d\x01\x01\x0b" + "\x05\x00\x30\x34\x31\x32\x30\x30\x06\x03\x55\x04\x03\x0c\x29\x43" + "\x65\x72\x74\x69\x66\x69\x63\x61\x74\x65\x20\x76\x65\x72\x69\x66" + "\x69\x63\x61\x74\x69\x6f\x6e\x20\x73\x65\x6c\x66\x2d\x74\x65\x73" + "\x74\x69\x6e\x67\x20\x6b\x65\x79\x30\x20\x17\x0d\x32\x32\x30\x35" + "\x31\x38\x32\x32\x33\x32\x34\x31\x5a\x18\x0f\x32\x31\x32\x32\x30" + "\x34\x32\x34\x32\x32\x33\x32\x34\x31\x5a\x30\x34\x31\x32\x30\x30" + "\x06\x03\x55\x04\x03\x0c\x29\x43\x65\x72\x74\x69\x66\x69\x63\x61" + "\x74\x65\x20\x76\x65\x72\x69\x66\x69\x63\x61\x74\x69\x6f\x6e\x20" + "\x73\x65\x6c\x66\x2d\x74\x65\x73\x74\x69\x6e\x67\x20\x6b\x65\x79" + "\x30\x82\x02\x22\x30\x0d\x06\x09\x2a\x86\x48\x86\xf7\x0d\x01\x01" + "\x01\x05\x00\x03\x82\x02\x0f\x00\x30\x82\x02\x0a\x02\x82\x02\x01" + "\x00\xcc\xac\x49\xdd\x3b\xca\xb0\x15\x7e\x84\x6a\xb2\x0a\x69\x5f" + "\x1c\x0a\x61\x82\x3b\x4f\x2c\xa3\x95\x2c\x08\x58\x4b\xb1\x5d\x99" + "\xe0\xc3\xc1\x79\xc2\xb3\xeb\xc0\x1e\x6d\x3e\x54\x1d\xbd\xb7\x92" + "\x7b\x4d\xb5\x95\x58\xb2\x52\x2e\xc6\x24\x4b\x71\x63\x80\x32\x77" + "\xa7\x38\x5e\xdb\x72\xae\x6e\x0d\xec\xfb\xb6\x6d\x01\x7f\xe9\x55" + "\x66\xdf\xbf\x1d\x76\x78\x02\x31\xe8\xe5\x07\xf8\xb7\x82\x5c\x0d" + "\xd4\xbb\xfb\xa2\x59\x0d\x2e\x3a\x78\x95\x3a\x8b\x46\x06\x47\x44" + "\x46\xd7\xcd\x06\x6a\x41\x13\xe3\x19\xf6\xbb\x6e\x38\xf4\x83\x01" + "\xa3\xbf\x4a\x39\x4f\xd7\x0a\xe9\x38\xb3\xf5\x94\x14\x4e\xdd\xf7" + "\x43\xfd\x24\xb2\x49\x3c\xa5\xf7\x7a\x7c\xd4\x45\x3d\x97\x75\x68" + "\xf1\xed\x4c\x42\x0b\x70\xca\x85\xf3\xde\xe5\x88\x2c\xc5\xbe\xb6" + "\x97\x34\xba\x24\x02\xcd\x8b\x86\x9f\xa9\x73\xca\x73\xcf\x92\x81" + "\xee\x75\x55\xbb\x18\x67\x5c\xff\x3f\xb5\xdd\x33\x1b\x0c\xe9\x78" + "\xdb\x5c\xcf\xaa\x5c\x43\x42\xdf\x5e\xa9\x6d\xec\xd7\xd7\xff\xe6" + "\xa1\x3a\x92\x1a\xda\xae\xf6\x8c\x6f\x7b\xd5\xb4\x6e\x06\xe9\x8f" + "\xe8\xde\x09\x31\x89\xed\x0e\x11\xa1\xfa\x8a\xe9\xe9\x64\x59\x62" + "\x53\xda\xd1\x70\xbe\x11\xd4\x99\x97\x11\xcf\x99\xde\x0b\x9d\x94" + "\x7e\xaa\xb8\x52\xea\x37\xdb\x90\x7e\x35\xbd\xd9\xfe\x6d\x0a\x48" + "\x70\x28\xdd\xd5\x0d\x7f\x03\x80\x93\x14\x23\x8f\xb9\x22\xcd\x7c" + "\x29\xfe\xf1\x72\xb5\x5c\x0b\x12\xcf\x9c\x15\xf6\x11\x4c\x7a\x45" + "\x25\x8c\x45\x0a\x34\xac\x2d\x9a\x81\xca\x0b\x13\x22\xcd\xeb\x1a" + "\x38\x88\x18\x97\x96\x08\x81\xaa\xcc\x8f\x0f\x8a\x32\x7b\x76\x68" + "\x03\x68\x43\xbf\x11\xba\x55\x60\xfd\x80\x1c\x0d\x9b\x69\xb6\x09" + "\x72\xbc\x0f\x41\x2f\x07\x82\xc6\xe3\xb2\x13\x91\xc4\x6d\x14\x95" + "\x31\xbe\x19\xbd\xbc\xed\xe1\x4c\x74\xa2\xe0\x78\x0b\xbb\x94\xec" + "\x4c\x53\x3a\xa2\xb5\x84\x1d\x4b\x65\x7e\xdc\xf7\xdb\x36\x7d\xbe" + "\x9e\x3b\x36\x66\x42\x66\x76\x35\xbf\xbe\xf0\xc1\x3c\x7c\xe9\x42" + "\x5c\x24\x53\x03\x05\xa8\x67\x24\x50\x02\x75\xff\x24\x46\x3b\x35" + "\x89\x76\xe6\x70\xda\xc5\x51\x8c\x9a\xe5\x05\xb0\x0b\xd0\x2d\xd4" + "\x7d\x57\x75\x94\x6b\xf9\x0a\xad\x0e\x41\x00\x15\xd0\x4f\xc0\x7f" + "\x90\x2d\x18\x48\x8f\x28\xfe\x5d\xa7\xcd\x99\x9e\xbd\x02\x6c\x8a" + "\x31\xf3\x1c\xc7\x4b\xe6\x93\xcd\x42\xa2\xe4\x68\x10\x47\x9d\xfc" + "\x21\x02\x03\x01\x00\x01\xa3\x5d\x30\x5b\x30\x0c\x06\x03\x55\x1d" + "\x13\x01\x01\xff\x04\x02\x30\x00\x30\x0b\x06\x03\x55\x1d\x0f\x04" + "\x04\x03\x02\x07\x80\x30\x1d\x06\x03\x55\x1d\x0e\x04\x16\x04\x14" + "\xf5\x87\x03\xbb\x33\xce\x1b\x73\xee\x02\xec\xcd\xee\x5b\x88\x17" + "\x51\x8f\xe3\xdb\x30\x1f\x06\x03\x55\x1d\x23\x04\x18\x30\x16\x80" + "\x14\xf5\x87\x03\xbb\x33\xce\x1b\x73\xee\x02\xec\xcd\xee\x5b\x88" + "\x17\x51\x8f\xe3\xdb\x30\x0d\x06\x09\x2a\x86\x48\x86\xf7\x0d\x01" + "\x01\x0b\x05\x00\x03\x82\x02\x01\x00\xc0\x2e\x12\x41\x7b\x73\x85" + "\x16\xc8\xdb\x86\x79\xe8\xf5\xcd\x44\xf4\xc6\xe2\x81\x23\x5e\x47" + "\xcb\xab\x25\xf1\x1e\x58\x3e\x31\x7f\x78\xad\x85\xeb\xfe\x14\x88" + "\x60\xf7\x7f\xd2\x26\xa2\xf4\x98\x2a\xfd\xba\x05\x0c\x20\x33\x12" + "\xcc\x4d\x14\x61\x64\x81\x93\xd3\x33\xed\xc8\xff\xf1\x78\xcc\x5f" + "\x51\x9f\x09\xd7\xbe\x0d\x5c\x74\xfd\x9b\xdf\x52\x4a\xc9\xa8\x71" + "\x25\x33\x04\x10\x67\x36\xd0\xb3\x0b\xc9\xa1\x40\x72\xae\x41\x7b" + "\x68\xe6\xe4\x7b\xd0\x28\xf7\x6d\xe7\x3f\x50\xfc\x91\x7c\x91\x56" + "\xd4\xdf\xa6\xbb\xe8\x4d\x1b\x58\xaa\x28\xfa\xc1\x19\xeb\x11\x2f" + "\x24\x8b\x7c\xc5\xa9\x86\x26\xaa\x6e\xb7\x9b\xd5\xf8\x06\xfb\x02" + "\x52\x7b\x9c\x9e\xa1\xe0\x07\x8b\x5e\xe4\xb8\x55\x29\xf6\x48\x52" + "\x1c\x1b\x54\x2d\x46\xd8\xe5\x71\xb9\x60\xd1\x45\xb5\x92\x89\x8a" + "\x63\x58\x2a\xb3\xc6\xb2\x76\xe2\x3c\x82\x59\x04\xae\x5a\xc4\x99" + "\x7b\x2e\x4b\x46\x57\xb8\x29\x24\xb2\xfd\xee\x2c\x0d\xa4\x83\xfa" + "\x65\x2a\x07\x35\x8b\x97\xcf\xbd\x96\x2e\xd1\x7e\x6c\xc2\x1e\x87" + "\xb6\x6c\x76\x65\xb5\xb2\x62\xda\x8b\xe9\x73\xe3\xdb\x33\xdd\x13" + "\x3a\x17\x63\x6a\x76\xde\x8d\x8f\xe0\x47\x61\x28\x3a\x83\xff\x8f" + "\xe7\xc7\xe0\x4a\xa3\xe5\x07\xcf\xe9\x8c\x35\x35\x2e\xe7\x80\x66" + "\x31\xbf\x91\x58\x0a\xe1\x25\x3d\x38\xd3\xa4\xf0\x59\x34\x47\x07" + "\x62\x0f\xbe\x30\xdd\x81\x88\x58\xf0\x28\xb0\x96\xe5\x82\xf8\x05" + "\xb7\x13\x01\xbc\xfa\xc6\x1f\x86\x72\xcc\xf9\xee\x8e\xd9\xd6\x04" + "\x8c\x24\x6c\xbf\x0f\x5d\x37\x39\xcf\x45\xc1\x93\x3a\xd2\xed\x5c" + "\x58\x79\x74\x86\x62\x30\x7e\x8e\xbb\xdd\x7a\xa9\xed\xca\x40\xcb" + "\x62\x47\xf4\xb4\x9f\x52\x7f\x72\x63\xa8\xf0\x2b\xaf\x45\x2a\x48" + "\x19\x6d\xe3\xfb\xf9\x19\x66\x69\xc8\xcc\x62\x87\x6c\x53\x2b\x2d" + "\x6e\x90\x6c\x54\x3a\x82\x25\x41\xcb\x18\x6a\xa4\x22\xa8\xa1\xc4" + "\x47\xd7\x81\x00\x1c\x15\x51\x0f\x1a\xaf\xef\x9f\xa6\x61\x8c\xbd" + "\x6b\x8b\xed\xe6\xac\x0e\xb6\x3a\x4c\x92\xe6\x0f\x91\x0a\x0f\x71" + "\xc7\xa0\xb9\x0d\x3a\x17\x5a\x6f\x35\xc8\xe7\x50\x4f\x46\xe8\x70" + "\x60\x48\x06\x82\x8b\x66\x58\xe6\x73\x91\x9c\x12\x3d\x35\x8e\x46" + "\xad\x5a\xf5\xb3\xdb\x69\x21\x04\xfd\xd3\x1c\xdf\x94\x9d\x56\xb0" + "\x0a\xd1\x95\x76\x8d\xec\x9e\xdd\x0b\x15\x97\x64\xad\xe5\xf2\x62" + "\x02\xfc\x9e\x5f\x56\x42\x39\x05\xb3" +}; + +/* + * Signed data and detached signature blobs that form the verification tests. + */ +static const __initconst u8 certs_selftest_1_data[] = { + "\x54\x68\x69\x73\x20\x69\x73\x20\x73\x6f\x6d\x65\x20\x74\x65\x73" + "\x74\x20\x64\x61\x74\x61\x20\x75\x73\x65\x64\x20\x66\x6f\x72\x20" + "\x73\x65\x6c\x66\x2d\x74\x65\x73\x74\x69\x6e\x67\x20\x63\x65\x72" + "\x74\x69\x66\x69\x63\x61\x74\x65\x20\x76\x65\x72\x69\x66\x69\x63" + "\x61\x74\x69\x6f\x6e\x2e\x0a" +}; + +static const __initconst u8 certs_selftest_1_pkcs7[] = { + "\x30\x82\x02\xab\x06\x09\x2a\x86\x48\x86\xf7\x0d\x01\x07\x02\xa0" + "\x82\x02\x9c\x30\x82\x02\x98\x02\x01\x01\x31\x0d\x30\x0b\x06\x09" + "\x60\x86\x48\x01\x65\x03\x04\x02\x01\x30\x0b\x06\x09\x2a\x86\x48" + "\x86\xf7\x0d\x01\x07\x01\x31\x82\x02\x75\x30\x82\x02\x71\x02\x01" + "\x01\x30\x4c\x30\x34\x31\x32\x30\x30\x06\x03\x55\x04\x03\x0c\x29" + "\x43\x65\x72\x74\x69\x66\x69\x63\x61\x74\x65\x20\x76\x65\x72\x69" + "\x66\x69\x63\x61\x74\x69\x6f\x6e\x20\x73\x65\x6c\x66\x2d\x74\x65" + "\x73\x74\x69\x6e\x67\x20\x6b\x65\x79\x02\x14\x73\x98\xea\x98\x2d" + "\xd0\x2e\xa8\xb1\xcf\x57\xc7\xf2\x97\xb3\xe6\x1a\xfc\x8c\x0a\x30" + "\x0b\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x30\x0d\x06\x09" + "\x2a\x86\x48\x86\xf7\x0d\x01\x01\x01\x05\x00\x04\x82\x02\x00\xac" + "\xb0\xf2\x07\xd6\x99\x6d\xc0\xc0\xd9\x8d\x31\x0d\x7e\x04\xeb\xc3" + "\x88\x90\xc4\x58\x46\xd4\xe2\xa0\xa3\x25\xe3\x04\x50\x37\x85\x8c" + "\x91\xc6\xfc\xc5\xd4\x92\xfd\x05\xd8\xb8\xa3\xb8\xba\x89\x13\x00" + "\x88\x79\x99\x51\x6b\x5b\x28\x31\xc0\xb3\x1b\x7a\x68\x2c\x00\xdb" + "\x4b\x46\x11\xf3\xfa\x50\x8e\x19\x89\xa2\x4c\xda\x4c\x89\x01\x11" + "\x89\xee\xd3\xc8\xc1\xe7\xa7\xf6\xb2\xa2\xf8\x65\xb8\x35\x20\x33" + "\xba\x12\x62\xd5\xbd\xaa\x71\xe5\x5b\xc0\x6a\x32\xff\x6a\x2e\x23" + "\xef\x2b\xb6\x58\xb1\xfb\x5f\x82\x34\x40\x6d\x9f\xbc\x27\xac\x37" + "\x23\x99\xcf\x7d\x20\xb2\x39\x01\xc0\x12\xce\xd7\x5d\x2f\xb6\xab" + "\xb5\x56\x4f\xef\xf4\x72\x07\x58\x65\xa9\xeb\x1f\x75\x1c\x5f\x0c" + "\x88\xe0\xa4\xe2\xcd\x73\x2b\x9e\xb2\x05\x7e\x12\xf8\xd0\x66\x41" + "\xcc\x12\x63\xd4\xd6\xac\x9b\x1d\x14\x77\x8d\x1c\x57\xd5\x27\xc6" + "\x49\xa2\x41\x43\xf3\x59\x29\xe5\xcb\xd1\x75\xbc\x3a\x97\x2a\x72" + "\x22\x66\xc5\x3b\xc1\xba\xfc\x53\x18\x98\xe2\x21\x64\xc6\x52\x87" + "\x13\xd5\x7c\x42\xe8\xfb\x9c\x9a\x45\x32\xd5\xa5\x22\x62\x9d\xd4" + "\xcb\xa4\xfa\x77\xbb\x50\x24\x0b\x8b\x88\x99\x15\x56\xa9\x1e\x92" + "\xbf\x5d\x94\x77\xb6\xf1\x67\x01\x60\x06\x58\x5c\xdf\x18\x52\x79" + "\x37\x30\x93\x7d\x87\x04\xf1\xe0\x55\x59\x52\xf3\xc2\xb1\x1c\x5b" + "\x12\x7c\x49\x87\xfb\xf7\xed\xdd\x95\x71\xec\x4b\x1a\x85\x08\xb0" + "\xa0\x36\xc4\x7b\xab\x40\xe0\xf1\x98\xcc\xaf\x19\x40\x8f\x47\x6f" + "\xf0\x6c\x84\x29\x7f\x7f\x04\x46\xcb\x08\x0f\xe0\xc1\xc9\x70\x6e" + "\x95\x3b\xa4\xbc\x29\x2b\x53\x67\x45\x1b\x0d\xbc\x13\xa5\x76\x31" + "\xaf\xb9\xd0\xe0\x60\x12\xd2\xf4\xb7\x7c\x58\x7e\xf6\x2d\xbb\x24" + "\x14\x5a\x20\x24\xa8\x12\xdf\x25\xbd\x42\xce\x96\x7c\x2e\xba\x14" + "\x1b\x81\x9f\x18\x45\xa4\xc6\x70\x3e\x0e\xf0\xd3\x7b\x9c\x10\xbe" + "\xb8\x7a\x89\xc5\x9e\xd9\x97\xdf\xd7\xe7\xc6\x1d\xc0\x20\x6c\xb8" + "\x1e\x3a\x63\xb8\x39\x8e\x8e\x62\xd5\xd2\xb4\xcd\xff\x46\xfc\x8e" + "\xec\x07\x35\x0c\xff\xb0\x05\xe6\xf4\xe5\xfe\xa2\xe3\x0a\xe6\x36" + "\xa7\x4a\x7e\x62\x1d\xc4\x50\x39\x35\x4e\x28\xcb\x4a\xfb\x9d\xdb" + "\xdd\x23\xd6\x53\xb1\x74\x77\x12\xf7\x9c\xf0\x9a\x6b\xf7\xa9\x64" + "\x2d\x86\x21\x2a\xcf\xc6\x54\xf5\xc9\xad\xfa\xb5\x12\xb4\xf3\x51" + "\x77\x55\x3c\x6f\x0c\x32\xd3\x8c\x44\x39\x71\x25\xfe\x96\xd2" +}; + +/* + * List of tests to be run. + */ +#define TEST(data, pkcs7) { data, sizeof(data) - 1, pkcs7, sizeof(pkcs7) - 1 } +static const struct certs_test certs_tests[] __initconst = { + TEST(certs_selftest_1_data, certs_selftest_1_pkcs7), +}; + +int __init fips_signature_selftest(void) +{ + struct key *keyring; + int ret, i; + + pr_notice("Running certificate verification selftests\n"); + + keyring = keyring_alloc(".certs_selftest", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | + KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(keyring)) + panic("Can't allocate certs selftest keyring: %ld\n", + PTR_ERR(keyring)); + + ret = x509_load_certificate_list(certs_selftest_keys, + sizeof(certs_selftest_keys) - 1, keyring); + if (ret < 0) + panic("Can't allocate certs selftest keyring: %d\n", ret); + + for (i = 0; i < ARRAY_SIZE(certs_tests); i++) { + const struct certs_test *test = &certs_tests[i]; + struct pkcs7_message *pkcs7; + + pkcs7 = pkcs7_parse_message(test->pkcs7, test->pkcs7_len); + if (IS_ERR(pkcs7)) + panic("Certs selftest %d: pkcs7_parse_message() = %d\n", i, ret); + + pkcs7_supply_detached_data(pkcs7, test->data, test->data_len); + + ret = pkcs7_verify(pkcs7, VERIFYING_MODULE_SIGNATURE); + if (ret < 0) + panic("Certs selftest %d: pkcs7_verify() = %d\n", i, ret); + + ret = pkcs7_validate_trust(pkcs7, keyring); + if (ret < 0) + panic("Certs selftest %d: pkcs7_validate_trust() = %d\n", i, ret); + + pkcs7_free_message(pkcs7); + } + + key_put(keyring); + return 0; +} diff --git a/certs/common.c b/crypto/asymmetric_keys/x509_loader.c index 16a220887a53..1bc169dee22e 100644 --- a/certs/common.c +++ b/crypto/asymmetric_keys/x509_loader.c @@ -2,11 +2,11 @@ #include <linux/kernel.h> #include <linux/key.h> -#include "common.h" +#include <keys/asymmetric-type.h> -int load_certificate_list(const u8 cert_list[], - const unsigned long list_size, - const struct key *keyring) +int x509_load_certificate_list(const u8 cert_list[], + const unsigned long list_size, + const struct key *keyring) { key_ref_t key; const u8 *p, *end; diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h index 97a886cbe01c..a299c9c56f40 100644 --- a/crypto/asymmetric_keys/x509_parser.h +++ b/crypto/asymmetric_keys/x509_parser.h @@ -41,6 +41,15 @@ struct x509_certificate { }; /* + * selftest.c + */ +#ifdef CONFIG_FIPS_SIGNATURE_SELFTEST +extern int __init fips_signature_selftest(void); +#else +static inline int fips_signature_selftest(void) { return 0; } +#endif + +/* * x509_cert_parser.c */ extern void x509_free_certificate(struct x509_certificate *cert); diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 77ed4e93ad56..0b4943a4592b 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -244,9 +244,15 @@ static struct asymmetric_key_parser x509_key_parser = { /* * Module stuff */ +extern int __init certs_selftest(void); static int __init x509_key_init(void) { - return register_asymmetric_key_parser(&x509_key_parser); + int ret; + + ret = register_asymmetric_key_parser(&x509_key_parser); + if (ret < 0) + return ret; + return fips_signature_selftest(); } static void __exit x509_key_exit(void) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index e81a9700cfd0..6143dbf31f31 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -1239,14 +1239,14 @@ error_cleanup_mc_io: static int fsl_mc_bus_remove(struct platform_device *pdev) { struct fsl_mc *mc = platform_get_drvdata(pdev); + struct fsl_mc_io *mc_io; if (!fsl_mc_is_root_dprc(&mc->root_mc_bus_dev->dev)) return -EINVAL; + mc_io = mc->root_mc_bus_dev->mc_io; fsl_mc_device_remove(mc->root_mc_bus_dev); - - fsl_destroy_mc_io(mc->root_mc_bus_dev->mc_io); - mc->root_mc_bus_dev->mc_io = NULL; + fsl_destroy_mc_io(mc_io); bus_unregister_notifier(&fsl_mc_bus_type, &fsl_mc_nb); diff --git a/drivers/char/random.c b/drivers/char/random.c index 655e327d425e..e3dd1dd3dd22 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -87,7 +87,7 @@ static struct fasync_struct *fasync; /* Control how we warn userspace. */ static struct ratelimit_state urandom_warning = - RATELIMIT_STATE_INIT("warn_urandom_randomness", HZ, 3); + RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE); static int ratelimit_disable __read_mostly = IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM); module_param_named(ratelimit_disable, ratelimit_disable, int, 0644); @@ -408,7 +408,7 @@ static ssize_t get_random_bytes_user(struct iov_iter *iter) /* * Immediately overwrite the ChaCha key at index 4 with random - * bytes, in case userspace causes copy_to_user() below to sleep + * bytes, in case userspace causes copy_to_iter() below to sleep * forever, so that we still retain forward secrecy in that case. */ crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE); @@ -1009,7 +1009,7 @@ void add_interrupt_randomness(int irq) if (new_count & MIX_INFLIGHT) return; - if (new_count < 64 && !time_is_before_jiffies(fast_pool->last + HZ)) + if (new_count < 1024 && !time_is_before_jiffies(fast_pool->last + HZ)) return; if (unlikely(!fast_pool->mix.func)) diff --git a/drivers/comedi/drivers/vmk80xx.c b/drivers/comedi/drivers/vmk80xx.c index 46023adc5395..4536ed43f65b 100644 --- a/drivers/comedi/drivers/vmk80xx.c +++ b/drivers/comedi/drivers/vmk80xx.c @@ -684,7 +684,7 @@ static int vmk80xx_alloc_usb_buffers(struct comedi_device *dev) if (!devpriv->usb_rx_buf) return -ENOMEM; - size = max(usb_endpoint_maxp(devpriv->ep_rx), MIN_BUF_SIZE); + size = max(usb_endpoint_maxp(devpriv->ep_tx), MIN_BUF_SIZE); devpriv->usb_tx_buf = kzalloc(size, GFP_KERNEL); if (!devpriv->usb_tx_buf) return -ENOMEM; diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index e7330684d3b8..9631f2fd2faf 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -32,8 +32,11 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct udmabuf *ubuf = vma->vm_private_data; + pgoff_t pgoff = vmf->pgoff; - vmf->page = ubuf->pages[vmf->pgoff]; + if (pgoff >= ubuf->pagecount) + return VM_FAULT_SIGBUS; + vmf->page = ubuf->pages[pgoff]; get_page(vmf->page); return 0; } diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index c9fe5903725a..9c89f7d53e99 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1211,7 +1211,7 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg) struct fw_cdev_get_cycle_timer2 *a = &arg->get_cycle_timer2; struct fw_card *card = client->device->card; struct timespec64 ts = {0, 0}; - u32 cycle_time; + u32 cycle_time = 0; int ret = 0; local_irq_disable(); diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 90ed8fdaba75..adddd8c45d0c 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -372,8 +372,7 @@ static ssize_t rom_index_show(struct device *dev, struct fw_device *device = fw_device(dev->parent); struct fw_unit *unit = fw_unit(dev); - return snprintf(buf, PAGE_SIZE, "%d\n", - (int)(unit->directory - device->config_rom)); + return sysfs_emit(buf, "%td\n", unit->directory - device->config_rom); } static struct device_attribute fw_unit_attributes[] = { @@ -403,8 +402,7 @@ static ssize_t guid_show(struct device *dev, int ret; down_read(&fw_device_rwsem); - ret = snprintf(buf, PAGE_SIZE, "0x%08x%08x\n", - device->config_rom[3], device->config_rom[4]); + ret = sysfs_emit(buf, "0x%08x%08x\n", device->config_rom[3], device->config_rom[4]); up_read(&fw_device_rwsem); return ret; diff --git a/drivers/firmware/efi/sysfb_efi.c b/drivers/firmware/efi/sysfb_efi.c index 4c7c9dd7733f..7882d4b3f2be 100644 --- a/drivers/firmware/efi/sysfb_efi.c +++ b/drivers/firmware/efi/sysfb_efi.c @@ -26,8 +26,6 @@ #include <linux/sysfb.h> #include <video/vga.h> -#include <asm/efi.h> - enum { OVERRIDE_NONE = 0x0, OVERRIDE_BASE = 0x1, diff --git a/drivers/gpio/gpio-vr41xx.c b/drivers/gpio/gpio-vr41xx.c index 98cd715ccc33..8d09b619c166 100644 --- a/drivers/gpio/gpio-vr41xx.c +++ b/drivers/gpio/gpio-vr41xx.c @@ -217,8 +217,6 @@ static int giu_get_irq(unsigned int irq) printk(KERN_ERR "spurious GIU interrupt: %04x(%04x),%04x(%04x)\n", maskl, pendl, maskh, pendh); - atomic_inc(&irq_err_count); - return -EINVAL; } diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index e7d316b1401a..c023b691441e 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -477,9 +477,6 @@ int i2c_dw_prepare_clk(struct dw_i2c_dev *dev, bool prepare) { int ret; - if (IS_ERR(dev->clk)) - return PTR_ERR(dev->clk); - if (prepare) { /* Optional interface clock */ ret = clk_prepare_enable(dev->pclk); diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 70ade5306e45..ba043b547393 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -320,8 +320,17 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) goto exit_reset; } - dev->clk = devm_clk_get(&pdev->dev, NULL); - if (!i2c_dw_prepare_clk(dev, true)) { + dev->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(dev->clk)) { + ret = PTR_ERR(dev->clk); + goto exit_reset; + } + + ret = i2c_dw_prepare_clk(dev, true); + if (ret) + goto exit_reset; + + if (dev->clk) { u64 clk_khz; dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz; diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index bdecb78bfc26..8e6985354fd5 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -1420,17 +1420,22 @@ static int mtk_i2c_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "Request I2C IRQ %d fail\n", irq); - return ret; + goto err_bulk_unprepare; } i2c_set_adapdata(&i2c->adap, i2c); ret = i2c_add_adapter(&i2c->adap); if (ret) - return ret; + goto err_bulk_unprepare; platform_set_drvdata(pdev, i2c); return 0; + +err_bulk_unprepare: + clk_bulk_unprepare(I2C_MT65XX_CLK_MAX, i2c->clocks); + + return ret; } static int mtk_i2c_remove(struct platform_device *pdev) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index 5960ccde6574..aede9d551130 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -2372,8 +2372,7 @@ static struct platform_driver npcm_i2c_bus_driver = { static int __init npcm_i2c_init(void) { npcm_i2c_debugfs_dir = debugfs_create_dir("npcm_i2c", NULL); - platform_driver_register(&npcm_i2c_bus_driver); - return 0; + return platform_driver_register(&npcm_i2c_bus_driver); } module_init(npcm_i2c_init); diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 4ab1038b5482..1f23a6be7d88 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -298,7 +298,7 @@ config XTENSA_MX config XILINX_INTC bool "Xilinx Interrupt Controller IP" - depends on MICROBLAZE || ARCH_ZYNQ || ARCH_ZYNQMP + depends on OF select IRQ_DOMAIN help Support for the Xilinx Interrupt Controller IP core. diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 12dd48727a15..5ac83185ff47 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -1035,6 +1035,7 @@ static void build_fiq_affinity(struct aic_irq_chip *ic, struct device_node *aff) continue; cpu = of_cpu_node_to_id(cpu_node); + of_node_put(cpu_node); if (WARN_ON(cpu < 0)) continue; @@ -1143,6 +1144,7 @@ static int __init aic_of_ic_init(struct device_node *node, struct device_node *p for_each_child_of_node(affs, chld) build_fiq_affinity(irqc, chld); } + of_node_put(affs); set_handle_irq(aic_handle_irq); set_handle_fiq(aic_handle_fiq); diff --git a/drivers/irqchip/irq-gic-realview.c b/drivers/irqchip/irq-gic-realview.c index b4c1924f0255..38fab02ffe9d 100644 --- a/drivers/irqchip/irq-gic-realview.c +++ b/drivers/irqchip/irq-gic-realview.c @@ -57,6 +57,7 @@ realview_gic_of_init(struct device_node *node, struct device_node *parent) /* The PB11MPCore GIC needs to be configured in the syscon */ map = syscon_node_to_regmap(np); + of_node_put(np); if (!IS_ERR(map)) { /* new irq mode with no DCC */ regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 2be8dea6b6b0..5c1cf907ee68 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1932,7 +1932,7 @@ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) gic_data.ppi_descs = kcalloc(gic_data.ppi_nr, sizeof(*gic_data.ppi_descs), GFP_KERNEL); if (!gic_data.ppi_descs) - return; + goto out_put_node; nr_parts = of_get_child_count(parts_node); @@ -1973,12 +1973,15 @@ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) continue; cpu = of_cpu_node_to_id(cpu_node); - if (WARN_ON(cpu < 0)) + if (WARN_ON(cpu < 0)) { + of_node_put(cpu_node); continue; + } pr_cont("%pOF[%d] ", cpu_node, cpu); cpumask_set_cpu(cpu, &part->mask); + of_node_put(cpu_node); } pr_cont("}\n"); diff --git a/drivers/irqchip/irq-loongson-liointc.c b/drivers/irqchip/irq-loongson-liointc.c index aed88857d90f..8d05d8bcf56f 100644 --- a/drivers/irqchip/irq-loongson-liointc.c +++ b/drivers/irqchip/irq-loongson-liointc.c @@ -39,6 +39,12 @@ #define LIOINTC_ERRATA_IRQ 10 +#if defined(CONFIG_MIPS) +#define liointc_core_id get_ebase_cpunum() +#else +#define liointc_core_id get_csr_cpuid() +#endif + struct liointc_handler_data { struct liointc_priv *priv; u32 parent_int_map; @@ -57,7 +63,7 @@ static void liointc_chained_handle_irq(struct irq_desc *desc) struct liointc_handler_data *handler = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); struct irq_chip_generic *gc = handler->priv->gc; - int core = cpu_logical_map(smp_processor_id()) % LIOINTC_NUM_CORES; + int core = liointc_core_id % LIOINTC_NUM_CORES; u32 pending; chained_irq_enter(chip, desc); diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c index 50a56820c99b..56bf502d9c67 100644 --- a/drivers/irqchip/irq-realtek-rtl.c +++ b/drivers/irqchip/irq-realtek-rtl.c @@ -134,9 +134,9 @@ static int __init map_interrupts(struct device_node *node, struct irq_domain *do if (!cpu_ictl) return -EINVAL; ret = of_property_read_u32(cpu_ictl, "#interrupt-cells", &tmp); + of_node_put(cpu_ictl); if (ret || tmp != 1) return -EINVAL; - of_node_put(cpu_ictl); cpu_int = be32_to_cpup(imap + 2); if (cpu_int > 7 || cpu_int < 2) diff --git a/drivers/irqchip/irq-uniphier-aidet.c b/drivers/irqchip/irq-uniphier-aidet.c index 89121b39be26..716b1bb88bf2 100644 --- a/drivers/irqchip/irq-uniphier-aidet.c +++ b/drivers/irqchip/irq-uniphier-aidet.c @@ -237,6 +237,7 @@ static const struct of_device_id uniphier_aidet_match[] = { { .compatible = "socionext,uniphier-ld11-aidet" }, { .compatible = "socionext,uniphier-ld20-aidet" }, { .compatible = "socionext,uniphier-pxs3-aidet" }, + { .compatible = "socionext,uniphier-nx1-aidet" }, { /* sentinel */ } }; diff --git a/drivers/misc/cardreader/rts5261.c b/drivers/misc/cardreader/rts5261.c index 749cc5a46d13..b1e76030cafd 100644 --- a/drivers/misc/cardreader/rts5261.c +++ b/drivers/misc/cardreader/rts5261.c @@ -407,6 +407,8 @@ static void rts5261_init_from_hw(struct rtsx_pcr *pcr) // default setting_reg1 = PCR_SETTING_REG1; setting_reg2 = PCR_SETTING_REG2; + } else { + return; } pci_read_config_dword(pdev, setting_reg2, &lval2); diff --git a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c index cebcca6d6d3e..cf2b8261da14 100644 --- a/drivers/misc/mei/hbm.c +++ b/drivers/misc/mei/hbm.c @@ -1351,7 +1351,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_CAP_SETUP) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: capabilities response: on shutdown, ignoring\n"); return 0; } diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index 64ce3f830262..15e8e2b322b1 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -109,6 +109,8 @@ #define MEI_DEV_ID_ADP_P 0x51E0 /* Alder Lake Point P */ #define MEI_DEV_ID_ADP_N 0x54E0 /* Alder Lake Point N */ +#define MEI_DEV_ID_RPL_S 0x7A68 /* Raptor Lake Point S */ + /* * MEI HW Section */ diff --git a/drivers/misc/mei/hw-me.c b/drivers/misc/mei/hw-me.c index 9870bf717979..befa491e3344 100644 --- a/drivers/misc/mei/hw-me.c +++ b/drivers/misc/mei/hw-me.c @@ -1154,6 +1154,8 @@ static int mei_me_hw_reset(struct mei_device *dev, bool intr_enable) ret = mei_me_d0i3_exit_sync(dev); if (ret) return ret; + } else { + hw->pg_state = MEI_PG_OFF; } } diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 33e58821e478..5435604327a7 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -116,6 +116,8 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_ADP_P, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ADP_N, MEI_ME_PCH15_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_RPL_S, MEI_ME_PCH15_CFG)}, + /* required last entry */ {0, } }; diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 195dc897188b..9da4489dc345 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -1356,7 +1356,7 @@ static void msdc_data_xfer_next(struct msdc_host *host, struct mmc_request *mrq) msdc_request_done(host, mrq); } -static bool msdc_data_xfer_done(struct msdc_host *host, u32 events, +static void msdc_data_xfer_done(struct msdc_host *host, u32 events, struct mmc_request *mrq, struct mmc_data *data) { struct mmc_command *stop; @@ -1376,7 +1376,7 @@ static bool msdc_data_xfer_done(struct msdc_host *host, u32 events, spin_unlock_irqrestore(&host->lock, flags); if (done) - return true; + return; stop = data->stop; if (check_data || (stop && stop->error)) { @@ -1385,12 +1385,15 @@ static bool msdc_data_xfer_done(struct msdc_host *host, u32 events, sdr_set_field(host->base + MSDC_DMA_CTRL, MSDC_DMA_CTRL_STOP, 1); + ret = readl_poll_timeout_atomic(host->base + MSDC_DMA_CTRL, val, + !(val & MSDC_DMA_CTRL_STOP), 1, 20000); + if (ret) + dev_dbg(host->dev, "DMA stop timed out\n"); + ret = readl_poll_timeout_atomic(host->base + MSDC_DMA_CFG, val, !(val & MSDC_DMA_CFG_STS), 1, 20000); - if (ret) { - dev_dbg(host->dev, "DMA stop timed out\n"); - return false; - } + if (ret) + dev_dbg(host->dev, "DMA inactive timed out\n"); sdr_clr_bits(host->base + MSDC_INTEN, data_ints_mask); dev_dbg(host->dev, "DMA stop\n"); @@ -1415,9 +1418,7 @@ static bool msdc_data_xfer_done(struct msdc_host *host, u32 events, } msdc_data_xfer_next(host, mrq); - done = true; } - return done; } static void msdc_set_buswidth(struct msdc_host *host, u32 width) @@ -2416,6 +2417,9 @@ static void msdc_cqe_disable(struct mmc_host *mmc, bool recovery) if (recovery) { sdr_set_field(host->base + MSDC_DMA_CTRL, MSDC_DMA_CTRL_STOP, 1); + if (WARN_ON(readl_poll_timeout(host->base + MSDC_DMA_CTRL, val, + !(val & MSDC_DMA_CTRL_STOP), 1, 3000))) + return; if (WARN_ON(readl_poll_timeout(host->base + MSDC_DMA_CFG, val, !(val & MSDC_DMA_CFG_STS), 1, 3000))) return; diff --git a/drivers/mmc/host/sdhci-pci-o2micro.c b/drivers/mmc/host/sdhci-pci-o2micro.c index 92c20cb8074a..0d4d343dbb77 100644 --- a/drivers/mmc/host/sdhci-pci-o2micro.c +++ b/drivers/mmc/host/sdhci-pci-o2micro.c @@ -152,6 +152,8 @@ static int sdhci_o2_get_cd(struct mmc_host *mmc) if (!(sdhci_readw(host, O2_PLL_DLL_WDT_CONTROL1) & O2_PLL_LOCK_STATUS)) sdhci_o2_enable_internal_clock(host); + else + sdhci_o2_wait_card_detect_stable(host); return !!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT); } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f85372adf042..6ba4c83fe5fc 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3684,9 +3684,11 @@ re_arm: if (!rtnl_trylock()) return; - if (should_notify_peers) + if (should_notify_peers) { + bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); + } if (should_notify_rtnl) { bond_slave_state_notify(bond); bond_slave_link_notify(bond); diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 2727d3169c25..1cbb05b0323f 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -2334,6 +2334,7 @@ static int qca8k_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu) { struct qca8k_priv *priv = ds->priv; + int ret; /* We have only have a general MTU setting. * DSA always set the CPU port's MTU to the largest MTU of the slave @@ -2344,8 +2345,27 @@ qca8k_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu) if (!dsa_is_cpu_port(ds, port)) return 0; + /* To change the MAX_FRAME_SIZE the cpu ports must be off or + * the switch panics. + * Turn off both cpu ports before applying the new value to prevent + * this. + */ + if (priv->port_enabled_map & BIT(0)) + qca8k_port_set_status(priv, 0, 0); + + if (priv->port_enabled_map & BIT(6)) + qca8k_port_set_status(priv, 6, 0); + /* Include L2 header / FCS length */ - return qca8k_write(priv, QCA8K_MAX_FRAME_SIZE, new_mtu + ETH_HLEN + ETH_FCS_LEN); + ret = qca8k_write(priv, QCA8K_MAX_FRAME_SIZE, new_mtu + ETH_HLEN + ETH_FCS_LEN); + + if (priv->port_enabled_map & BIT(0)) + qca8k_port_set_status(priv, 0, 1); + + if (priv->port_enabled_map & BIT(6)) + qca8k_port_set_status(priv, 6, 1); + + return ret; } static int diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h index 04408e11402a..ec58d0e80a70 100644 --- a/drivers/net/dsa/qca8k.h +++ b/drivers/net/dsa/qca8k.h @@ -15,7 +15,7 @@ #define QCA8K_ETHERNET_MDIO_PRIORITY 7 #define QCA8K_ETHERNET_PHY_PRIORITY 6 -#define QCA8K_ETHERNET_TIMEOUT 100 +#define QCA8K_ETHERNET_TIMEOUT 5 #define QCA8K_NUM_PORTS 7 #define QCA8K_NUM_CPU_PORTS 2 diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c index 60ae8bfc5f69..1749d26f4bef 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c @@ -43,9 +43,7 @@ static bool check_image_valid(struct hinic_devlink_priv *priv, const u8 *buf, for (i = 0; i < fw_image->fw_info.fw_section_cnt; i++) { len += fw_image->fw_section_info[i].fw_section_len; - memcpy(&host_image->image_section_info[i], - &fw_image->fw_section_info[i], - sizeof(struct fw_section_info_st)); + host_image->image_section_info[i] = fw_image->fw_section_info[i]; } if (len != fw_image->fw_len || diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 1e71b70f0e52..70335f6e8524 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -2190,6 +2190,42 @@ ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks, } /** + * ice_set_phy_type_from_speed - set phy_types based on speeds + * and advertised modes + * @ks: ethtool link ksettings struct + * @phy_type_low: pointer to the lower part of phy_type + * @phy_type_high: pointer to the higher part of phy_type + * @adv_link_speed: targeted link speeds bitmap + */ +static void +ice_set_phy_type_from_speed(const struct ethtool_link_ksettings *ks, + u64 *phy_type_low, u64 *phy_type_high, + u16 adv_link_speed) +{ + /* Handle 1000M speed in a special way because ice_update_phy_type + * enables all link modes, but having mixed copper and optical + * standards is not supported. + */ + adv_link_speed &= ~ICE_AQ_LINK_SPEED_1000MB; + + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseT_Full)) + *phy_type_low |= ICE_PHY_TYPE_LOW_1000BASE_T | + ICE_PHY_TYPE_LOW_1G_SGMII; + + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseKX_Full)) + *phy_type_low |= ICE_PHY_TYPE_LOW_1000BASE_KX; + + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseX_Full)) + *phy_type_low |= ICE_PHY_TYPE_LOW_1000BASE_SX | + ICE_PHY_TYPE_LOW_1000BASE_LX; + + ice_update_phy_type(phy_type_low, phy_type_high, adv_link_speed); +} + +/** * ice_set_link_ksettings - Set Speed and Duplex * @netdev: network interface device structure * @ks: ethtool ksettings @@ -2320,7 +2356,8 @@ ice_set_link_ksettings(struct net_device *netdev, adv_link_speed = curr_link_speed; /* Convert the advertise link speeds to their corresponded PHY_TYPE */ - ice_update_phy_type(&phy_type_low, &phy_type_high, adv_link_speed); + ice_set_phy_type_from_speed(ks, &phy_type_low, &phy_type_high, + adv_link_speed); if (!autoneg_changed && adv_link_speed == curr_link_speed) { netdev_info(netdev, "Nothing changed, exiting without setting anything.\n"); @@ -3470,6 +3507,16 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) new_rx = ch->combined_count + ch->rx_count; new_tx = ch->combined_count + ch->tx_count; + if (new_rx < vsi->tc_cfg.numtc) { + netdev_err(dev, "Cannot set less Rx channels, than Traffic Classes you have (%u)\n", + vsi->tc_cfg.numtc); + return -EINVAL; + } + if (new_tx < vsi->tc_cfg.numtc) { + netdev_err(dev, "Cannot set less Tx channels, than Traffic Classes you have (%u)\n", + vsi->tc_cfg.numtc); + return -EINVAL; + } if (new_rx > ice_get_max_rxq(pf)) { netdev_err(dev, "Maximum allowed Rx channels is %d\n", ice_get_max_rxq(pf)); diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 454e01ae09b9..f7f9c973ec54 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -909,7 +909,7 @@ static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt) * @vsi: the VSI being configured * @ctxt: VSI context structure */ -static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) +static int ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) { u16 offset = 0, qmap = 0, tx_count = 0, pow = 0; u16 num_txq_per_tc, num_rxq_per_tc; @@ -982,7 +982,18 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) else vsi->num_rxq = num_rxq_per_tc; + if (vsi->num_rxq > vsi->alloc_rxq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n", + vsi->num_rxq, vsi->alloc_rxq); + return -EINVAL; + } + vsi->num_txq = tx_count; + if (vsi->num_txq > vsi->alloc_txq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n", + vsi->num_txq, vsi->alloc_txq); + return -EINVAL; + } if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) { dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); @@ -1000,6 +1011,8 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) */ ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]); ctxt->info.q_mapping[1] = cpu_to_le16(vsi->num_rxq); + + return 0; } /** @@ -1187,7 +1200,10 @@ static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi) if (vsi->type == ICE_VSI_CHNL) { ice_chnl_vsi_setup_q_map(vsi, ctxt); } else { - ice_vsi_setup_q_map(vsi, ctxt); + ret = ice_vsi_setup_q_map(vsi, ctxt); + if (ret) + goto out; + if (!init_vsi) /* means VSI being updated */ /* must to indicate which section of VSI context are * being modified @@ -3464,7 +3480,7 @@ void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc) * * Prepares VSI tc_config to have queue configurations based on MQPRIO options. */ -static void +static int ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, u8 ena_tc) { @@ -3513,7 +3529,18 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, /* Set actual Tx/Rx queue pairs */ vsi->num_txq = offset + qcount_tx; + if (vsi->num_txq > vsi->alloc_txq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n", + vsi->num_txq, vsi->alloc_txq); + return -EINVAL; + } + vsi->num_rxq = offset + qcount_rx; + if (vsi->num_rxq > vsi->alloc_rxq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n", + vsi->num_rxq, vsi->alloc_rxq); + return -EINVAL; + } /* Setup queue TC[0].qmap for given VSI context */ ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); @@ -3531,6 +3558,8 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, dev_dbg(ice_pf_to_dev(vsi->back), "vsi->num_rxq = %d\n", vsi->num_rxq); dev_dbg(ice_pf_to_dev(vsi->back), "all_numtc %u, all_enatc: 0x%04x, tc_cfg.numtc %u\n", vsi->all_numtc, vsi->all_enatc, vsi->tc_cfg.numtc); + + return 0; } /** @@ -3580,9 +3609,12 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) if (vsi->type == ICE_VSI_PF && test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) - ice_vsi_setup_q_map_mqprio(vsi, ctx, ena_tc); + ret = ice_vsi_setup_q_map_mqprio(vsi, ctx, ena_tc); else - ice_vsi_setup_q_map(vsi, ctx); + ret = ice_vsi_setup_q_map(vsi, ctx); + + if (ret) + goto out; /* must to indicate which section of VSI context are being modified */ ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID); diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 0a0c55fb8699..b803f2ab3cc7 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -524,6 +524,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) */ fltr->rid = rule_added.rid; fltr->rule_id = rule_added.rule_id; + fltr->dest_id = rule_added.vsi_handle; exit: kfree(list); @@ -993,7 +994,9 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, n_proto_key = ntohs(match.key->n_proto); n_proto_mask = ntohs(match.mask->n_proto); - if (n_proto_key == ETH_P_ALL || n_proto_key == 0) { + if (n_proto_key == ETH_P_ALL || n_proto_key == 0 || + fltr->tunnel_type == TNL_GTPU || + fltr->tunnel_type == TNL_GTPC) { n_proto_key = 0; n_proto_mask = 0; } else { diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 68be2976f539..c5f04c40284b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4819,8 +4819,11 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring) while (i != tx_ring->next_to_use) { union e1000_adv_tx_desc *eop_desc, *tx_desc; - /* Free all the Tx ring sk_buffs */ - dev_kfree_skb_any(tx_buffer->skb); + /* Free all the Tx ring sk_buffs or xdp frames */ + if (tx_buffer->type == IGB_TYPE_SKB) + dev_kfree_skb_any(tx_buffer->skb); + else + xdp_return_frame(tx_buffer->xdpf); /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -9898,11 +9901,10 @@ static void igb_init_dmac(struct igb_adapter *adapter, u32 pba) struct e1000_hw *hw = &adapter->hw; u32 dmac_thr; u16 hwm; + u32 reg; if (hw->mac.type > e1000_82580) { if (adapter->flags & IGB_FLAG_DMAC) { - u32 reg; - /* force threshold to 0. */ wr32(E1000_DMCTXTH, 0); @@ -9935,7 +9937,6 @@ static void igb_init_dmac(struct igb_adapter *adapter, u32 pba) /* Disable BMC-to-OS Watchdog Enable */ if (hw->mac.type != e1000_i354) reg &= ~E1000_DMACR_DC_BMC2OSW_EN; - wr32(E1000_DMACR, reg); /* no lower threshold to disable @@ -9952,12 +9953,12 @@ static void igb_init_dmac(struct igb_adapter *adapter, u32 pba) */ wr32(E1000_DMCTXTH, (IGB_MIN_TXPBSIZE - (IGB_TX_BUF_4096 + adapter->max_frame_size)) >> 6); + } - /* make low power state decision controlled - * by DMA coal - */ + if (hw->mac.type >= e1000_i210 || + (adapter->flags & IGB_FLAG_DMAC)) { reg = rd32(E1000_PCIEMISC); - reg &= ~E1000_PCIEMISC_LX_DECISION; + reg |= E1000_PCIEMISC_LX_DECISION; wr32(E1000_PCIEMISC, reg); } /* endif adapter->dmac is not disabled */ } else if (hw->mac.type == e1000_82580) { diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 45c3c4a1101b..9fb567524220 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -99,6 +99,7 @@ struct sixpack { unsigned int rx_count; unsigned int rx_count_cooked; + spinlock_t rxlock; int mtu; /* Our mtu (to spot changes!) */ int buffsize; /* Max buffers sizes */ @@ -565,6 +566,7 @@ static int sixpack_open(struct tty_struct *tty) sp->dev = dev; spin_lock_init(&sp->lock); + spin_lock_init(&sp->rxlock); refcount_set(&sp->refcnt, 1); init_completion(&sp->dead); @@ -913,6 +915,7 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd) sp->led_state = 0x60; /* fill trailing bytes with zeroes */ sp->tty->ops->write(sp->tty, &sp->led_state, 1); + spin_lock_bh(&sp->rxlock); rest = sp->rx_count; if (rest != 0) for (i = rest; i <= 3; i++) @@ -930,6 +933,7 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd) sp_bump(sp, 0); } sp->rx_count_cooked = 0; + spin_unlock_bh(&sp->rxlock); } break; case SIXP_TX_URUN: printk(KERN_DEBUG "6pack: TX underrun\n"); @@ -959,8 +963,11 @@ sixpack_decode(struct sixpack *sp, const unsigned char *pre_rbuff, int count) decode_prio_command(sp, inbyte); else if ((inbyte & SIXP_STD_CMD_MASK) != 0) decode_std_command(sp, inbyte); - else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) + else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) { + spin_lock_bh(&sp->rxlock); decode_data(sp, inbyte); + spin_unlock_bh(&sp->rxlock); + } } } diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c index a8db1a19011b..c7047f5d7a9b 100644 --- a/drivers/net/phy/aquantia_main.c +++ b/drivers/net/phy/aquantia_main.c @@ -34,6 +34,8 @@ #define MDIO_AN_VEND_PROV 0xc400 #define MDIO_AN_VEND_PROV_1000BASET_FULL BIT(15) #define MDIO_AN_VEND_PROV_1000BASET_HALF BIT(14) +#define MDIO_AN_VEND_PROV_5000BASET_FULL BIT(11) +#define MDIO_AN_VEND_PROV_2500BASET_FULL BIT(10) #define MDIO_AN_VEND_PROV_DOWNSHIFT_EN BIT(4) #define MDIO_AN_VEND_PROV_DOWNSHIFT_MASK GENMASK(3, 0) #define MDIO_AN_VEND_PROV_DOWNSHIFT_DFLT 4 @@ -231,9 +233,20 @@ static int aqr_config_aneg(struct phy_device *phydev) phydev->advertising)) reg |= MDIO_AN_VEND_PROV_1000BASET_HALF; + /* Handle the case when the 2.5G and 5G speeds are not advertised */ + if (linkmode_test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + phydev->advertising)) + reg |= MDIO_AN_VEND_PROV_2500BASET_FULL; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + phydev->advertising)) + reg |= MDIO_AN_VEND_PROV_5000BASET_FULL; + ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_VEND_PROV, MDIO_AN_VEND_PROV_1000BASET_HALF | - MDIO_AN_VEND_PROV_1000BASET_FULL, reg); + MDIO_AN_VEND_PROV_1000BASET_FULL | + MDIO_AN_VEND_PROV_2500BASET_FULL | + MDIO_AN_VEND_PROV_5000BASET_FULL, reg); if (ret < 0) return ret; if (ret > 0) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 6a467e7817a6..59fe356942b5 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -2072,6 +2072,8 @@ static struct phy_driver at803x_driver[] = { /* ATHEROS AR9331 */ PHY_ID_MATCH_EXACT(ATH9331_PHY_ID), .name = "Qualcomm Atheros AR9331 built-in PHY", + .probe = at803x_probe, + .remove = at803x_remove, .suspend = at803x_suspend, .resume = at803x_resume, .flags = PHY_POLL_CABLE_TEST, @@ -2087,6 +2089,8 @@ static struct phy_driver at803x_driver[] = { /* Qualcomm Atheros QCA9561 */ PHY_ID_MATCH_EXACT(QCA9561_PHY_ID), .name = "Qualcomm Atheros QCA9561 built-in PHY", + .probe = at803x_probe, + .remove = at803x_remove, .suspend = at803x_suspend, .resume = at803x_resume, .flags = PHY_POLL_CABLE_TEST, @@ -2151,6 +2155,8 @@ static struct phy_driver at803x_driver[] = { PHY_ID_MATCH_EXACT(QCA8081_PHY_ID), .name = "Qualcomm QCA8081", .flags = PHY_POLL_CABLE_TEST, + .probe = at803x_probe, + .remove = at803x_remove, .config_intr = at803x_config_intr, .handle_interrupt = at803x_handle_interrupt, .get_tunable = at803x_get_tunable, diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c index 1b54684b68a0..96d3c40932d8 100644 --- a/drivers/net/phy/smsc.c +++ b/drivers/net/phy/smsc.c @@ -110,7 +110,7 @@ static int smsc_phy_config_init(struct phy_device *phydev) struct smsc_phy_priv *priv = phydev->priv; int rc; - if (!priv->energy_enable) + if (!priv->energy_enable || phydev->irq != PHY_POLL) return 0; rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS); @@ -210,6 +210,8 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev) * response on link pulses to detect presence of plugged Ethernet cable. * The Energy Detect Power-Down mode is enabled again in the end of procedure to * save approximately 220 mW of power if cable is unplugged. + * The workaround is only applicable to poll mode. Energy Detect Power-Down may + * not be used in interrupt mode lest link change detection becomes unreliable. */ static int lan87xx_read_status(struct phy_device *phydev) { @@ -217,7 +219,7 @@ static int lan87xx_read_status(struct phy_device *phydev) int err = genphy_read_status(phydev); - if (!phydev->link && priv->energy_enable) { + if (!phydev->link && priv->energy_enable && phydev->irq == PHY_POLL) { /* Disable EDPD to wake up PHY */ int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS); if (rc < 0) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 466da01ba2e3..2cb833b3006a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -312,6 +312,7 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct netdev_queue *queue = NULL; struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; @@ -329,6 +330,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; + queue = netdev_get_tx_queue(dev, rxq); /* The napi pointer is available when an XDP program is * attached or when GRO is enabled @@ -340,6 +342,8 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { + if (queue) + txq_trans_cond_update(queue); if (!use_napi) dev_lstats_add(dev, length); } else { diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index db05b5e930be..969a67970e71 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2768,7 +2768,6 @@ static const struct ethtool_ops virtnet_ethtool_ops = { static void virtnet_freeze_down(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; - int i; /* Make sure no work handler is accessing the device */ flush_work(&vi->config_work); @@ -2776,14 +2775,8 @@ static void virtnet_freeze_down(struct virtio_device *vdev) netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); netif_tx_unlock_bh(vi->dev); - cancel_delayed_work_sync(&vi->refill); - - if (netif_running(vi->dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - napi_disable(&vi->rq[i].napi); - virtnet_napi_tx_disable(&vi->sq[i].napi); - } - } + if (netif_running(vi->dev)) + virtnet_close(vi->dev); } static int init_vqs(struct virtnet_info *vi); @@ -2791,7 +2784,7 @@ static int init_vqs(struct virtnet_info *vi); static int virtnet_restore_up(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; - int err, i; + int err; err = init_vqs(vi); if (err) @@ -2800,15 +2793,9 @@ static int virtnet_restore_up(struct virtio_device *vdev) virtio_device_ready(vdev); if (netif_running(vi->dev)) { - for (i = 0; i < vi->curr_queue_pairs; i++) - if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) - schedule_delayed_work(&vi->refill, 0); - - for (i = 0; i < vi->max_queue_pairs; i++) { - virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); - virtnet_napi_tx_enable(vi, vi->sq[i].vq, - &vi->sq[i].napi); - } + err = virtnet_open(vi->dev); + if (err) + return err; } netif_tx_lock_bh(vi->dev); diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index d0eab5700dc5..00684e11976b 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -160,8 +160,8 @@ static void ibmvfc_npiv_logout(struct ibmvfc_host *); static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *); static void ibmvfc_tgt_move_login(struct ibmvfc_target *); -static void ibmvfc_release_sub_crqs(struct ibmvfc_host *); -static void ibmvfc_init_sub_crqs(struct ibmvfc_host *); +static void ibmvfc_dereg_sub_crqs(struct ibmvfc_host *); +static void ibmvfc_reg_sub_crqs(struct ibmvfc_host *); static const char *unknown_error = "unknown error"; @@ -917,7 +917,7 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost) struct vio_dev *vdev = to_vio_dev(vhost->dev); unsigned long flags; - ibmvfc_release_sub_crqs(vhost); + ibmvfc_dereg_sub_crqs(vhost); /* Re-enable the CRQ */ do { @@ -936,7 +936,7 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost) spin_unlock(vhost->crq.q_lock); spin_unlock_irqrestore(vhost->host->host_lock, flags); - ibmvfc_init_sub_crqs(vhost); + ibmvfc_reg_sub_crqs(vhost); return rc; } @@ -955,7 +955,7 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) struct vio_dev *vdev = to_vio_dev(vhost->dev); struct ibmvfc_queue *crq = &vhost->crq; - ibmvfc_release_sub_crqs(vhost); + ibmvfc_dereg_sub_crqs(vhost); /* Close the CRQ */ do { @@ -988,7 +988,7 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) spin_unlock(vhost->crq.q_lock); spin_unlock_irqrestore(vhost->host->host_lock, flags); - ibmvfc_init_sub_crqs(vhost); + ibmvfc_reg_sub_crqs(vhost); return rc; } @@ -5682,6 +5682,8 @@ static int ibmvfc_alloc_queue(struct ibmvfc_host *vhost, queue->cur = 0; queue->fmt = fmt; queue->size = PAGE_SIZE / fmt_size; + + queue->vhost = vhost; return 0; } @@ -5757,9 +5759,6 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, ENTER; - if (ibmvfc_alloc_queue(vhost, scrq, IBMVFC_SUB_CRQ_FMT)) - return -ENOMEM; - rc = h_reg_sub_crq(vdev->unit_address, scrq->msg_token, PAGE_SIZE, &scrq->cookie, &scrq->hw_irq); @@ -5790,7 +5789,6 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, } scrq->hwq_id = index; - scrq->vhost = vhost; LEAVE; return 0; @@ -5800,7 +5798,6 @@ irq_failed: rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie); } while (rtas_busy_delay(rc)); reg_failed: - ibmvfc_free_queue(vhost, scrq); LEAVE; return rc; } @@ -5826,12 +5823,50 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) if (rc) dev_err(dev, "Failed to free sub-crq[%d]: rc=%ld\n", index, rc); - ibmvfc_free_queue(vhost, scrq); + /* Clean out the queue */ + memset(scrq->msgs.crq, 0, PAGE_SIZE); + scrq->cur = 0; + + LEAVE; +} + +static void ibmvfc_reg_sub_crqs(struct ibmvfc_host *vhost) +{ + int i, j; + + ENTER; + if (!vhost->mq_enabled || !vhost->scsi_scrqs.scrqs) + return; + + for (i = 0; i < nr_scsi_hw_queues; i++) { + if (ibmvfc_register_scsi_channel(vhost, i)) { + for (j = i; j > 0; j--) + ibmvfc_deregister_scsi_channel(vhost, j - 1); + vhost->do_enquiry = 0; + return; + } + } + + LEAVE; +} + +static void ibmvfc_dereg_sub_crqs(struct ibmvfc_host *vhost) +{ + int i; + + ENTER; + if (!vhost->mq_enabled || !vhost->scsi_scrqs.scrqs) + return; + + for (i = 0; i < nr_scsi_hw_queues; i++) + ibmvfc_deregister_scsi_channel(vhost, i); + LEAVE; } static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) { + struct ibmvfc_queue *scrq; int i, j; ENTER; @@ -5847,30 +5882,41 @@ static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) } for (i = 0; i < nr_scsi_hw_queues; i++) { - if (ibmvfc_register_scsi_channel(vhost, i)) { - for (j = i; j > 0; j--) - ibmvfc_deregister_scsi_channel(vhost, j - 1); + scrq = &vhost->scsi_scrqs.scrqs[i]; + if (ibmvfc_alloc_queue(vhost, scrq, IBMVFC_SUB_CRQ_FMT)) { + for (j = i; j > 0; j--) { + scrq = &vhost->scsi_scrqs.scrqs[j - 1]; + ibmvfc_free_queue(vhost, scrq); + } kfree(vhost->scsi_scrqs.scrqs); vhost->scsi_scrqs.scrqs = NULL; vhost->scsi_scrqs.active_queues = 0; vhost->do_enquiry = 0; - break; + vhost->mq_enabled = 0; + return; } } + ibmvfc_reg_sub_crqs(vhost); + LEAVE; } static void ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost) { + struct ibmvfc_queue *scrq; int i; ENTER; if (!vhost->scsi_scrqs.scrqs) return; - for (i = 0; i < nr_scsi_hw_queues; i++) - ibmvfc_deregister_scsi_channel(vhost, i); + ibmvfc_dereg_sub_crqs(vhost); + + for (i = 0; i < nr_scsi_hw_queues; i++) { + scrq = &vhost->scsi_scrqs.scrqs[i]; + ibmvfc_free_queue(vhost, scrq); + } kfree(vhost->scsi_scrqs.scrqs); vhost->scsi_scrqs.scrqs = NULL; diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h index 3718406e0988..c39a245f43d0 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.h +++ b/drivers/scsi/ibmvscsi/ibmvfc.h @@ -789,6 +789,7 @@ struct ibmvfc_queue { spinlock_t _lock; spinlock_t *q_lock; + struct ibmvfc_host *vhost; struct ibmvfc_event_pool evt_pool; struct list_head sent; struct list_head free; @@ -797,7 +798,6 @@ struct ibmvfc_queue { union ibmvfc_iu cancel_rsp; /* Sub-CRQ fields */ - struct ibmvfc_host *vhost; unsigned long cookie; unsigned long vios_cookie; unsigned long hw_irq; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 1f423f723d06..b8a76b89f85a 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2826,6 +2826,24 @@ static void zbc_open_zone(struct sdebug_dev_info *devip, } } +static inline void zbc_set_zone_full(struct sdebug_dev_info *devip, + struct sdeb_zone_state *zsp) +{ + switch (zsp->z_cond) { + case ZC2_IMPLICIT_OPEN: + devip->nr_imp_open--; + break; + case ZC3_EXPLICIT_OPEN: + devip->nr_exp_open--; + break; + default: + WARN_ONCE(true, "Invalid zone %llu condition %x\n", + zsp->z_start, zsp->z_cond); + break; + } + zsp->z_cond = ZC5_FULL; +} + static void zbc_inc_wp(struct sdebug_dev_info *devip, unsigned long long lba, unsigned int num) { @@ -2838,7 +2856,7 @@ static void zbc_inc_wp(struct sdebug_dev_info *devip, if (zsp->z_type == ZBC_ZTYPE_SWR) { zsp->z_wp += num; if (zsp->z_wp >= zend) - zsp->z_cond = ZC5_FULL; + zbc_set_zone_full(devip, zsp); return; } @@ -2857,7 +2875,7 @@ static void zbc_inc_wp(struct sdebug_dev_info *devip, n = num; } if (zsp->z_wp >= zend) - zsp->z_cond = ZC5_FULL; + zbc_set_zone_full(devip, zsp); num -= n; lba += n; diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 2c0dd64159b0..5d21f07456c6 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -212,7 +212,12 @@ iscsi_create_endpoint(int dd_size) return NULL; mutex_lock(&iscsi_ep_idr_mutex); - id = idr_alloc(&iscsi_ep_idr, ep, 0, -1, GFP_NOIO); + + /* + * First endpoint id should be 1 to comply with user space + * applications (iscsid). + */ + id = idr_alloc(&iscsi_ep_idr, ep, 1, -1, GFP_NOIO); if (id < 0) { mutex_unlock(&iscsi_ep_idr_mutex); printk(KERN_ERR "Could not allocate endpoint ID. Error %d.\n", diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index ca3530982e52..fe000da11332 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1844,7 +1844,7 @@ static struct scsi_host_template scsi_driver = { .cmd_per_lun = 2048, .this_id = -1, /* Ensure there are no gaps in presented sgls */ - .virt_boundary_mask = PAGE_SIZE-1, + .virt_boundary_mask = HV_HYP_PAGE_SIZE - 1, .no_write_same = 1, .track_queue_depth = 1, .change_queue_depth = storvsc_change_queue_depth, @@ -1895,6 +1895,7 @@ static int storvsc_probe(struct hv_device *device, int target = 0; struct storvsc_device *stor_device; int max_sub_channels = 0; + u32 max_xfer_bytes; /* * We support sub-channels for storage on SCSI and FC controllers. @@ -1968,12 +1969,28 @@ static int storvsc_probe(struct hv_device *device, } /* max cmd length */ host->max_cmd_len = STORVSC_MAX_CMD_LEN; - /* - * set the table size based on the info we got - * from the host. + * Any reasonable Hyper-V configuration should provide + * max_transfer_bytes value aligning to HV_HYP_PAGE_SIZE, + * protecting it from any weird value. + */ + max_xfer_bytes = round_down(stor_device->max_transfer_bytes, HV_HYP_PAGE_SIZE); + /* max_hw_sectors_kb */ + host->max_sectors = max_xfer_bytes >> 9; + /* + * There are 2 requirements for Hyper-V storvsc sgl segments, + * based on which the below calculation for max segments is + * done: + * + * 1. Except for the first and last sgl segment, all sgl segments + * should be align to HV_HYP_PAGE_SIZE, that also means the + * maximum number of segments in a sgl can be calculated by + * dividing the total max transfer length by HV_HYP_PAGE_SIZE. + * + * 2. Except for the first and last, each entry in the SGL must + * have an offset that is a multiple of HV_HYP_PAGE_SIZE. */ - host->sg_tablesize = (stor_device->max_transfer_bytes >> PAGE_SHIFT); + host->sg_tablesize = (max_xfer_bytes >> HV_HYP_PAGE_SHIFT) + 1; /* * For non-IDE disks, the host supports multiple channels. * Set the number of HW queues we are supporting. diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 01fb4bad86be..ce86d1b790c0 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -748,17 +748,28 @@ static enum utp_ocs ufshcd_get_tr_ocs(struct ufshcd_lrb *lrbp) } /** - * ufshcd_utrl_clear - Clear a bit in UTRLCLR register + * ufshcd_utrl_clear() - Clear requests from the controller request list. * @hba: per adapter instance - * @pos: position of the bit to be cleared + * @mask: mask with one bit set for each request to be cleared */ -static inline void ufshcd_utrl_clear(struct ufs_hba *hba, u32 pos) +static inline void ufshcd_utrl_clear(struct ufs_hba *hba, u32 mask) { if (hba->quirks & UFSHCI_QUIRK_BROKEN_REQ_LIST_CLR) - ufshcd_writel(hba, (1 << pos), REG_UTP_TRANSFER_REQ_LIST_CLEAR); - else - ufshcd_writel(hba, ~(1 << pos), - REG_UTP_TRANSFER_REQ_LIST_CLEAR); + mask = ~mask; + /* + * From the UFSHCI specification: "UTP Transfer Request List CLear + * Register (UTRLCLR): This field is bit significant. Each bit + * corresponds to a slot in the UTP Transfer Request List, where bit 0 + * corresponds to request slot 0. A bit in this field is set to ‘0’ + * by host software to indicate to the host controller that a transfer + * request slot is cleared. The host controller + * shall free up any resources associated to the request slot + * immediately, and shall set the associated bit in UTRLDBR to ‘0’. The + * host software indicates no change to request slots by setting the + * associated bits in this field to ‘1’. Bits in this field shall only + * be set ‘1’ or ‘0’ by host software when UTRLRSR is set to ‘1’." + */ + ufshcd_writel(hba, ~mask, REG_UTP_TRANSFER_REQ_LIST_CLEAR); } /** @@ -2863,27 +2874,26 @@ static int ufshcd_compose_dev_cmd(struct ufs_hba *hba, return ufshcd_compose_devman_upiu(hba, lrbp); } -static int -ufshcd_clear_cmd(struct ufs_hba *hba, int tag) +/* + * Clear all the requests from the controller for which a bit has been set in + * @mask and wait until the controller confirms that these requests have been + * cleared. + */ +static int ufshcd_clear_cmds(struct ufs_hba *hba, u32 mask) { - int err = 0; unsigned long flags; - u32 mask = 1 << tag; /* clear outstanding transaction before retry */ spin_lock_irqsave(hba->host->host_lock, flags); - ufshcd_utrl_clear(hba, tag); + ufshcd_utrl_clear(hba, mask); spin_unlock_irqrestore(hba->host->host_lock, flags); /* * wait for h/w to clear corresponding bit in door-bell. * max. wait is 1 sec. */ - err = ufshcd_wait_for_register(hba, - REG_UTP_TRANSFER_REQ_DOOR_BELL, - mask, ~mask, 1000, 1000); - - return err; + return ufshcd_wait_for_register(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL, + mask, ~mask, 1000, 1000); } static int @@ -2963,7 +2973,7 @@ static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba, err = -ETIMEDOUT; dev_dbg(hba->dev, "%s: dev_cmd request timedout, tag %d\n", __func__, lrbp->task_tag); - if (!ufshcd_clear_cmd(hba, lrbp->task_tag)) + if (!ufshcd_clear_cmds(hba, 1U << lrbp->task_tag)) /* successfully cleared the command, retry if needed */ err = -EAGAIN; /* @@ -6958,14 +6968,14 @@ int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba, } /** - * ufshcd_eh_device_reset_handler - device reset handler registered to - * scsi layer. + * ufshcd_eh_device_reset_handler() - Reset a single logical unit. * @cmd: SCSI command pointer * * Returns SUCCESS/FAILED */ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) { + unsigned long flags, pending_reqs = 0, not_cleared = 0; struct Scsi_Host *host; struct ufs_hba *hba; u32 pos; @@ -6984,14 +6994,24 @@ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) } /* clear the commands that were pending for corresponding LUN */ - for_each_set_bit(pos, &hba->outstanding_reqs, hba->nutrs) { - if (hba->lrb[pos].lun == lun) { - err = ufshcd_clear_cmd(hba, pos); - if (err) - break; - __ufshcd_transfer_req_compl(hba, 1U << pos); - } + spin_lock_irqsave(&hba->outstanding_lock, flags); + for_each_set_bit(pos, &hba->outstanding_reqs, hba->nutrs) + if (hba->lrb[pos].lun == lun) + __set_bit(pos, &pending_reqs); + hba->outstanding_reqs &= ~pending_reqs; + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + if (ufshcd_clear_cmds(hba, pending_reqs) < 0) { + spin_lock_irqsave(&hba->outstanding_lock, flags); + not_cleared = pending_reqs & + ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); + hba->outstanding_reqs |= not_cleared; + spin_unlock_irqrestore(&hba->outstanding_lock, flags); + + dev_err(hba->dev, "%s: failed to clear requests %#lx\n", + __func__, not_cleared); } + __ufshcd_transfer_req_compl(hba, pending_reqs & ~not_cleared); out: hba->req_abort_count = 0; @@ -7088,7 +7108,7 @@ static int ufshcd_try_to_abort_task(struct ufs_hba *hba, int tag) goto out; } - err = ufshcd_clear_cmd(hba, tag); + err = ufshcd_clear_cmds(hba, 1U << tag); if (err) dev_err(hba->dev, "%s: Failed clearing cmd at tag %d, err %d\n", __func__, tag, err); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 79df61fe0e59..baf2b152229e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, const unsigned char **wnames, *uname; int i, n, l, clone, access; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *old_fid = NULL; + struct p9_fid *fid, *old_fid; v9ses = v9fs_dentry2v9ses(dentry); access = v9ses->flags & V9FS_ACCESS_MASK; @@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, if (IS_ERR(fid)) return fid; + refcount_inc(&fid->count); v9fs_fid_add(dentry->d_sb->s_root, fid); } /* If we are root ourself just return that */ - if (dentry->d_sb->s_root == dentry) { - refcount_inc(&fid->count); + if (dentry->d_sb->s_root == dentry) return fid; - } /* * Do a multipath walk with attached root. * When walking parent we need to make sure we @@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = ERR_PTR(n); goto err_out; } + old_fid = fid; clone = 1; i = 0; while (i < n) { @@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, * walk to ensure none of the patch component change */ fid = p9_client_walk(fid, l, &wnames[i], clone); + /* non-cloning walk will return the same fid */ + if (fid != old_fid) { + p9_client_clunk(old_fid); + old_fid = fid; + } if (IS_ERR(fid)) { - if (old_fid) { - /* - * If we fail, clunk fid which are mapping - * to path component and not the last component - * of the path. - */ - p9_client_clunk(old_fid); - } kfree(wnames); goto err_out; } - old_fid = fid; i += l; clone = 0; } diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index a8f512b44a85..d0833fa69faf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -58,8 +58,21 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct inode *inode = file_inode(file); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; + BUG_ON(!fid); + + /* we might need to read from a fid that was opened write-only + * for read-modify-write of page cache, use the writeback fid + * for that */ + if (rreq->origin == NETFS_READ_FOR_WRITE && + (fid->mode & O_ACCMODE) == O_WRONLY) { + fid = v9inode->writeback_fid; + BUG_ON(!fid); + } + refcount_inc(&fid->count); rreq->netfs_priv = fid; return 0; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 419d2f3cf2c2..3d8297714772 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1251,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); + if (!v9fs_proto_dotu(v9ses)) + return ERR_PTR(-EBADF); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); - if (!v9fs_proto_dotu(v9ses)) - return ERR_PTR(-EBADF); - st = p9_client_stat(fid); p9_client_clunk(fid); if (IS_ERR(st)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index d17502a738a9..b6eb1160296c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_client_clunk(dfid); goto out; } @@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); + p9_client_clunk(dfid); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), @@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); + p9_client_clunk(dfid); goto error; } v9fs_invalidate_inode_attr(dir); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 89630acbc2cc..64dab70d4a4f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - if (!(query_flags & AT_STATX_DONT_SYNC) && + if (vnode->volume && + !(query_flags & AT_STATX_DONT_SYNC) && !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89e94ea2fef5..4ba005c41983 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4632,6 +4632,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4672,8 +4683,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b1fdc6a26c76..6627dd7875ee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -763,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -821,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -883,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, "metadata ratio %u", info->metadata_ratio); @@ -901,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -933,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1014,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -1030,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1044,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ -1059,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1985,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + /* V1 cache is not supported for subpage mount. */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + ret = -EINVAL; + goto restore; + } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 0bece97547d4..d417de354d9d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -81,6 +81,9 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, } /* If we didn't find the channel, it is likely a bug */ + if (server) + cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", + server->conn_id); WARN_ON(1); return 0; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index eaf975f1ad89..b515140bad8d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -5154,6 +5154,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, data = &info; size = sizeof(struct smb2_file_eof_info); + trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof)); + return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, 1, &data, &size); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 2be5e0c8564d..6b88dc2e364f 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); DEFINE_SMB3_RW_DONE_EVENT(zero_done); DEFINE_SMB3_RW_DONE_EVENT(falloc_done); +/* For logging successful set EOF (truncate) */ +DECLARE_EVENT_CLASS(smb3_eof_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset), + TP_ARGS(xid, fid, tid, sesid, offset), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->offset) +) + +#define DEFINE_SMB3_EOF_EVENT(name) \ +DEFINE_EVENT(smb3_eof_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset), \ + TP_ARGS(xid, fid, tid, sesid, offset)) + +DEFINE_SMB3_EOF_EVENT(set_eof); + /* * For handle based calls other than read and write, and get/set info */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3dce7d058985..84c0eb55071d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -829,7 +829,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); } /* Maximum number of blocks we map for direct IO at once. */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9f12f29bc346..9e06334771a3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4104,6 +4104,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = size >> bsbits; start = start_off >> bsbits; + /* + * For tiny groups (smaller than 8MB) the chosen allocation + * alignment may be larger than group size. Make sure the + * alignment does not move allocation to a different group which + * makes mballoc fail assertions later. + */ + start = max(start, rounddown(ac->ac_o_ex.fe_logical, + (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); + /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; @@ -4176,7 +4185,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } rcu_read_unlock(); - if (start + size <= ac->ac_o_ex.fe_logical && + /* + * In this function "start" and "size" are normalized for better + * alignment and length such that we could preallocate more blocks. + * This normalization is done such that original request of + * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and + * "size" boundaries. + * (Note fe_len can be relaxed since FS block allocation API does not + * provide gurantee on number of contiguous blocks allocation since that + * depends upon free space left, etc). + * In case of inode pa, later we use the allocated blocks + * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated + * range of goal/best blocks [start, size] to put it at the + * ac_o_ex.fe_logical extent of this inode. + * (See ext4_mb_use_inode_pa() for more details) + */ + if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7a5353a8cfd7..42f590518b4c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -438,7 +438,7 @@ int ext4_ext_migrate(struct inode *inode) /* * Worst case we can touch the allocation bitmaps and a block - * group descriptor block. We do need need to worry about + * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 47d0ca4c795b..db4ba99d1ceb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1929,7 +1929,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; + unsigned continued; + int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 14695e2b5042..97fa7b4c645f 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -465,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* * In the first loop we prepare and mark buffers to submit. We have to * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * end_page_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 90a941d20dff..8b70a4701293 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; /* + * If the reserved GDT blocks is non-zero, the resize_inode feature + * should always be set. + */ + if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && + !ext4_has_feature_resize_inode(sb)) { + ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); + return -EFSCORRUPTED; + } + + /* * If we are not using the primary superblock/GDT copy don't resize, * because the user tools have no way of handling this. Probably a * bad time to do it anyways. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 450c918d68fc..845f2f8aee5f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); @@ -1870,31 +1870,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es) } #endif -static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) -{ -#ifdef CONFIG_FS_ENCRYPTION - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - - err = fscrypt_set_test_dummy_encryption(sb, arg, - &sbi->s_dummy_enc_policy); - if (err) { - ext4_msg(sb, KERN_WARNING, - "Error while setting test dummy encryption [%d]", err); - return err; - } - ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); -#endif - return 0; -} - #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) -#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) @@ -1911,7 +1892,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; - char *test_dummy_enc_arg; + struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; @@ -1953,7 +1934,7 @@ static void ext4_fc_free(struct fs_context *fc) for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); - kfree(ctx->test_dummy_enc_arg); + fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } @@ -2029,6 +2010,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) } #endif +static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, + struct ext4_fs_context *ctx) +{ + int err; + + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + ext4_msg(NULL, KERN_WARNING, + "test_dummy_encryption option not supported"); + return -EINVAL; + } + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->dummy_enc_policy); + if (err == -EINVAL) { + ext4_msg(NULL, KERN_WARNING, + "Value of option \"%s\" is unrecognized", param->key); + } else if (err == -EEXIST) { + ext4_msg(NULL, KERN_WARNING, + "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + return err; +} + #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ @@ -2291,29 +2295,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (param->type == fs_value_is_flag) { - ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; - ctx->test_dummy_enc_arg = NULL; - return 0; - } - if (*param->string && - !(!strcmp(param->string, "v1") || - !strcmp(param->string, "v2"))) { - ext4_msg(NULL, KERN_WARNING, - "Value of option \"%s\" is unrecognized", - param->key); - return -EINVAL; - } - ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; - ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, - GFP_KERNEL); - return 0; -#else - ext4_msg(NULL, KERN_WARNING, - "test_dummy_encryption option not supported"); - return -EINVAL; -#endif + return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX @@ -2504,7 +2486,8 @@ parse_failed: if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; - ret = ext4_apply_options(fc, sb); + ext4_apply_options(fc, sb); + ret = 0; out_free: if (fc) { @@ -2673,11 +2656,11 @@ err_jquota_specified: static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { -#ifdef CONFIG_FS_ENCRYPTION const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; - if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)) + if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { @@ -2691,14 +2674,46 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && - !sbi->s_dummy_enc_policy.policy) { + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, + &ctx->dummy_enc_policy)) + return 0; ext4_msg(NULL, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); + "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } -#endif /* CONFIG_FS_ENCRYPTION */ - return 0; + /* Also make sure s_mount_opts didn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, + &ctx->dummy_enc_policy)) + return 0; + ext4_msg(NULL, KERN_WARNING, + "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + /* + * fscrypt_add_test_dummy_key() technically changes the super_block, so + * technically it should be delayed until ext4_apply_options() like the + * other changes. But since we never get here for remounts (see above), + * and this is the last chance to report errors, we do it here. + */ + err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); + if (err) + ext4_msg(NULL, KERN_WARNING, + "Error adding test dummy encryption key [%d]", err); + return err; +} + +static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, + struct super_block *sb) +{ + if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) + return; + EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; + memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, @@ -2785,11 +2800,10 @@ fail_dax_change_remount: return ext4_check_quota_consistency(fc, sb); } -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; - int ret = 0; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; @@ -2825,11 +2839,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) #endif ext4_apply_quota_options(fc, sb); - - if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) - ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg); - - return ret; + ext4_apply_test_dummy_encryption(ctx, sb); } @@ -4552,9 +4562,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err < 0) goto failed_mount; - err = ext4_apply_options(fc, sb); - if (err < 0) - goto failed_mount; + ext4_apply_options(fc, sb); #if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { @@ -5302,14 +5310,6 @@ no_journal: err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } - /* - * Update the checksum after updating free space/inode - * counters. Otherwise the superblock can have an incorrect - * checksum in the buffer cache until it is written out and - * e2fsprogs programs trying to open a file system immediately - * after it is mounted can fail. - */ - ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); @@ -5367,6 +5367,14 @@ no_journal: EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + /* + * Update the checksum after updating free space/inode counters and + * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); @@ -5898,7 +5906,6 @@ static void ext4_update_super(struct super_block *sb) static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; - int error = 0; if (!sbh) return -EINVAL; @@ -5907,6 +5914,13 @@ static int ext4_commit_super(struct super_block *sb) ext4_update_super(sb); + lock_buffer(sbh); + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(sbh)) { + unlock_buffer(sbh); + return -EIO; + } + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -5921,17 +5935,21 @@ static int ext4_commit_super(struct super_block *sb) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - error = __sync_dirty_buffer(sbh, - REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); + get_bh(sbh); + /* Clear potential dirty bit if it was journalled update */ + clear_buffer_dirty(sbh); + sbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, + REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); + wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); + return -EIO; } - return error; + return 0; } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 042325349098..564e28a1aa94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1895,11 +1895,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e49bb0938376..e9c308ae475f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2114,7 +2114,7 @@ out: /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation - * @page: to try and free + * @folio: Folio to detach data from. * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c8520284dda7..c1eda73254e1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, rv = NFS4_OK; break; case -ENOENT: + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); /* Embrace your forgetfulness! */ rv = NFS4ERR_NOMATCHING_LAYOUT; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a8ecdd527662..0c4e8dd6aa96 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2124,6 +2124,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } goto out; } + file->f_mode |= FMODE_CAN_ODIRECT; err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 03d3a270eff4..e88f6b18445e 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -93,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_file_set_open_context(filp, ctx); nfs_fscache_open_file(inode, filp); err = 0; + filp->f_mode |= FMODE_CAN_ODIRECT; out_put_ctx: put_nfs_open_context(ctx); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 68a87be3e6f9..41a9b6b58fb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, pnfs_clear_lseg_state(lseg, lseg_list); pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) pnfs_clear_layoutreturn_waitbit(lo); @@ -1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { - if (atomic_dec_and_test(&lo->plh_outstanding)) - wake_up_var(&lo->plh_outstanding); + if (atomic_dec_and_test(&lo->plh_outstanding) && + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) @@ -2025,11 +2027,11 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. */ - if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); - lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - !atomic_read(&lo->plh_outstanding))); + lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, + TASK_KILLABLE)); if (IS_ERR(lseg)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); @@ -2152,6 +2154,12 @@ lookup_again: case -ERECALLCONFLICT: case -EAGAIN: break; + case -ENODATA: + /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ + pnfs_layout_set_fail_bit( + lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + goto out_put_layout_hdr; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); @@ -2407,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && + !pnfs_is_first_layoutget(lo)) goto out_forget; if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 07f11489e4e9..f331f067691b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -105,6 +105,7 @@ enum { NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ NFS_LAYOUT_HASHED, /* The layout visible */ + NFS_LAYOUT_DRAIN, }; enum layoutdriver_policy_flags { diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de7252715b12..81d26abf486f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * Only one instances directory is allowed. * - * The instances directory is special as it allows for mkdir and rmdir to + * The instances directory is special as it allows for mkdir and rmdir * to be done by userspace. When a mkdir or rmdir is performed, the inode * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 836ab1b8ed7b..1824f61621a2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -997,9 +997,11 @@ xfs_attr_set( /* * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. + * removal to fail as well. Preserve the logged flag, since we need + * to pass that through to the logging code. */ - args->op_flags = XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_OKNOENT | + (args->op_flags & XFS_DA_OP_LOGGED); if (args->value) { XFS_STATS_INC(mp, xs_attr_set); @@ -1439,12 +1441,11 @@ static int xfs_attr_node_try_addname( struct xfs_attr_intent *attr) { - struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; - trace_xfs_attr_node_addname(args); + trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index e329da3e7afa..b4a2fc77017e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -28,16 +28,6 @@ struct xfs_attr_list_context; */ #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ -static inline bool xfs_has_larp(struct xfs_mount *mp) -{ -#ifdef DEBUG - /* Logged xattrs require a V5 super for log_incompat */ - return xfs_has_crc(mp) && xfs_globals.larp; -#else - return false; -#endif -} - /* * Kernel-internal version of the attrlist cursor. */ @@ -624,7 +614,7 @@ static inline enum xfs_delattr_state xfs_attr_init_replace_state(struct xfs_da_args *args) { args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; - if (xfs_has_larp(args->dp->i_mount)) + if (args->op_flags & XFS_DA_OP_LOGGED) return xfs_attr_init_remove_state(args); return xfs_attr_init_add_state(args); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15a990409463..37e7c33f6283 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1530,7 +1530,7 @@ xfs_attr3_leaf_add_work( if (tmp) entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_REPLACE) { - if (!xfs_has_larp(mp)) + if (!(args->op_flags & XFS_DA_OP_LOGGED)) entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index d33b7686a0b3..ffa3df5b2893 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -92,6 +92,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ #define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ #define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -101,7 +102,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ { XFS_DA_OP_NOTIME, "NOTIME" }, \ { XFS_DA_OP_REMOVE, "REMOVE" }, \ - { XFS_DA_OP_RECOVERY, "RECOVERY" } + { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ + { XFS_DA_OP_LOGGED, "LOGGED" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4a28c2d77070..135d44133477 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -413,18 +413,20 @@ xfs_attr_create_intent( struct xfs_mount *mp = tp->t_mountp; struct xfs_attri_log_item *attrip; struct xfs_attr_intent *attr; + struct xfs_da_args *args; ASSERT(count == 1); - if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) - return NULL; - /* * Each attr item only performs one attribute operation at a time, so * this is a list of one */ attr = list_first_entry_or_null(items, struct xfs_attr_intent, xattri_list); + args = attr->xattri_da_args; + + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + return NULL; /* * Create a buffer to store the attribute name and value. This buffer @@ -432,8 +434,6 @@ xfs_attr_create_intent( * and the lower level xattr log items. */ if (!attr->xattri_nameval) { - struct xfs_da_args *args = attr->xattri_da_args; - /* * Transfer our reference to the name/value buffer to the * deferred work state structure. @@ -617,7 +617,10 @@ xfs_attri_item_recover( args->namelen = nv->name.i_len; args->hashval = xfs_da_hashname(args->name, args->namelen); args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; - args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | + XFS_DA_OP_LOGGED; + + ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); switch (attr->xattri_op_flags) { case XFS_ATTRI_OP_FLAGS_SET: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5a364a7d58fd..0d67ff8a8961 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1096,7 +1096,8 @@ xfs_flags2diflags2( { uint64_t di_flags2 = (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | - XFS_DIFLAG2_BIGTIME)); + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 35e13e125ec6..c325a28b89a8 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -68,6 +68,18 @@ xfs_attr_rele_log_assist( xlog_drop_incompat_feat(mp->m_log); } +static inline bool +xfs_attr_want_log_assist( + struct xfs_mount *mp) +{ +#ifdef DEBUG + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; +#else + return false; +#endif +} + /* * Set or remove an xattr, having grabbed the appropriate logging resources * prior to calling libxfs. @@ -80,11 +92,14 @@ xfs_attr_change( bool use_logging = false; int error; - if (xfs_has_larp(mp)) { + ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + + if (xfs_attr_want_log_assist(mp)) { error = xfs_attr_grab_log_assist(mp); if (error) return error; + args->op_flags |= XFS_DA_OP_LOGGED; use_logging = true; } diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h index 6c5d4963e15b..69a13e1e5b2e 100644 --- a/include/keys/asymmetric-type.h +++ b/include/keys/asymmetric-type.h @@ -84,6 +84,9 @@ extern struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_2, bool partial); +int x509_load_certificate_list(const u8 cert_list[], const unsigned long list_size, + const struct key *keyring); + /* * The payload is at the discretion of the subtype. */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 6491fa8fba6d..15b940ec1eac 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -143,6 +143,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD_FP func:req +#ifdef CONFIG_FRAME_POINTER + STACK_FRAME_NON_STANDARD \func +#endif +.endm + .macro ANNOTATE_NOENDBR .Lhere_\@: .pushsection .discard.noendbr diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index c21c7f8103e2..002266693e50 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -23,12 +23,16 @@ struct ratelimit_state { unsigned long flags; }; -#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ - .interval = interval_init, \ - .burst = burst_init, \ +#define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .interval = interval_init, \ + .burst = burst_init, \ + .flags = flags_init, \ } +#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) \ + RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, 0) + #define RATELIMIT_STATE_INIT_DISABLED \ RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index c1b5dcd6597c..daead5fb389a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,6 +253,11 @@ struct inet_sock { #define IP_CMSG_CHECKSUM BIT(7) #define IP_CMSG_RECVFRAGSIZE BIT(8) +static inline bool sk_is_inet(struct sock *sk) +{ + return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; +} + /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 63d0ac7dfe2f..eb12d4f705cc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4815,6 +4815,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env, n = btf_nr_types(btf); for (i = start_id; i < n; i++) { const struct btf_type *t; + int chain_limit = 32; u32 cur_id = i; t = btf_type_by_id(btf, i); @@ -4827,6 +4828,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, in_tags = btf_type_is_type_tag(t); while (btf_type_is_modifier(t)) { + if (!chain_limit--) { + btf_verifier_log(env, "Max chain length or cycle detected"); + return -ELOOP; + } if (btf_type_is_type_tag(t)) { if (!in_tags) { btf_verifier_log(env, "Type tags don't precede modifiers"); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e6b8e564b37f..886789dcee43 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1006,8 +1006,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); - if (is_chained) + if (is_chained) { desc->action = NULL; + WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc))); + } desc->depth = 1; } desc->handle_irq = handle; @@ -1033,6 +1035,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; + WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc))); irq_activate_and_startup(desc, IRQ_RESEND); } } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81e87280513e..f06b91ca6482 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5432,7 +5432,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) * be guessable and still allows some pin nesting in * our u32 pin_count. */ - cookie.val = 1 + (prandom_u32() >> 16); + cookie.val = 1 + (sched_clock() & 0xffff); hlock->pin_count += cookie.val; return cookie; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 20a66bf9f465..89c71fce225d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -665,7 +665,7 @@ static void power_down(void) hibernation_platform_enter(); fallthrough; case HIBERNATION_SHUTDOWN: - if (pm_power_off) + if (kernel_can_power_off()) kernel_power_off(); break; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfa7452ca92e..da0bf6fe9ecd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4798,25 +4798,55 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) static void balance_push(struct rq *rq); +/* + * balance_push_callback is a right abuse of the callback interface and plays + * by significantly different rules. + * + * Where the normal balance_callback's purpose is to be ran in the same context + * that queued it (only later, when it's safe to drop rq->lock again), + * balance_push_callback is specifically targeted at __schedule(). + * + * This abuse is tolerated because it places all the unlikely/odd cases behind + * a single test, namely: rq->balance_callback == NULL. + */ struct callback_head balance_push_callback = { .next = NULL, .func = (void (*)(struct callback_head *))balance_push, }; -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct callback_head * +__splice_balance_callbacks(struct rq *rq, bool split) { struct callback_head *head = rq->balance_callback; + if (likely(!head)) + return NULL; + lockdep_assert_rq_held(rq); - if (head) + /* + * Must not take balance_push_callback off the list when + * splice_balance_callbacks() and balance_callbacks() are not + * in the same rq->lock section. + * + * In that case it would be possible for __schedule() to interleave + * and observe the list empty. + */ + if (split && head == &balance_push_callback) + head = NULL; + else rq->balance_callback = NULL; return head; } +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + return __splice_balance_callbacks(rq, true); +} + static void __balance_callbacks(struct rq *rq) { - do_balance_callbacks(rq, splice_balance_callbacks(rq)); + do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } static inline void balance_callbacks(struct rq *rq, struct callback_head *head) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 01259611beb9..47b89a0fc6e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1693,6 +1693,11 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_rq_held(rq); + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a13e6ac6327..88589d74a892 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, kprobe_multi_link_prog_run(link, entry_ip, regs); } -static int symbols_cmp(const void *a, const void *b) +static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; @@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b) return strcmp(*str_a, *str_b); } +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; + +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; + + swap(*name_a, *name_b); + + /* If defined, swap also related cookies. */ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2468,38 +2490,46 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!addrs) return -ENOMEM; + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + if (uaddrs) { if (copy_from_user(addrs, uaddrs, size)) { err = -EFAULT; goto error; } } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; struct user_syms us; err = copy_user_syms(&us, usyms, cnt); if (err) goto error; - sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL); + if (cookies) + data.funcs = us.syms; + + sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + err = ftrace_lookup_symbols(us.syms, cnt, addrs); free_user_syms(&us); if (err) goto error; } - ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); - if (ucookies) { - cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); - if (!cookies) { - err = -ENOMEM; - goto error; - } - if (copy_from_user(cookies, ucookies, size)) { - err = -EFAULT; - goto error; - } - } - link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e750fe141a60..601ccf1b2f09 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name, struct module *mod, unsigned long addr) { struct kallsyms_data *args = data; + const char **sym; + int idx; - if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp)) + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) return 0; addr = ftrace_location(addr); if (!addr) return 0; - args->addrs[args->found++] = addr; + args->addrs[idx] = addr; + args->found++; return args->found == args->cnt ? 1 : 0; } @@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int err; + memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; args.cnt = cnt; diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index b56833700d23..c69d82273ce7 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + fn = freelist_try_get(&rh->pool); if (!fn) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c95992e2c71..a8cfac0611bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6424,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) synchronize_rcu(); free_snapshot(tr); } -#endif -#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 93507330462c..a245ea673715 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1718,8 +1718,17 @@ static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); - struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return 0; + tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..c3dc4f859a6b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -546,7 +546,6 @@ static int __trace_uprobe_create(int argc, const char **argv) bool is_return = false; int i, ret; - ret = 0; ref_ctr_offset = 0; switch (argv[0][0]) { diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index c4fe15d38b60..a9f7eb047768 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -94,7 +94,7 @@ config UBSAN_UNREACHABLE bool "Perform checking for unreachable code" # objtool already handles unreachable checking and gets angry about # seeing UBSan instrumentation located in unreachable places. - depends on !(OBJTOOL && (STACK_VALIDATION || UNWINDER_ORC || X86_SMAP)) + depends on !(OBJTOOL && (STACK_VALIDATION || UNWINDER_ORC || HAVE_UACCESS_VALIDATION)) depends on $(cc-option,-fsanitize=unreachable) help This option enables -fsanitize=unreachable which checks for control diff --git a/mm/filemap.c b/mm/filemap.c index ac3775c1ce4c..ffdfbc8b0e3c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2385,6 +2385,8 @@ static void filemap_get_read_batch(struct address_space *mapping, continue; if (xas.xa_index > max || xa_is_value(folio)) break; + if (xa_is_sibling(folio)) + break; if (!folio_try_get_rcu(folio)) goto retry; @@ -2629,6 +2631,13 @@ err: return err; } +static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) +{ + unsigned int shift = folio_shift(folio); + + return (pos1 >> shift == pos2 >> shift); +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2700,11 +2709,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, writably_mapped = mapping_writably_mapped(mapping); /* - * When a sequential read accesses a page several times, only + * When a read accesses the same folio several times, only * mark it as accessed the first time. */ - if (iocb->ki_pos >> PAGE_SHIFT != - ra->prev_pos >> PAGE_SHIFT) + if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1, + fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7248002dad9..834f288b3769 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2377,6 +2377,7 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; + page_tail->private = 0; /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); diff --git a/mm/migrate.c b/mm/migrate.c index e51588e95f57..6c1ea61f39d8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1106,6 +1106,7 @@ static int unmap_and_move(new_page_t get_new_page, if (!newpage) return -ENOMEM; + newpage->private = 0; rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); diff --git a/mm/readahead.c b/mm/readahead.c index 57a015108254..fdcd28cbd92d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -510,6 +510,7 @@ void page_cache_ra_order(struct readahead_control *ractl, new_order--; } + filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; @@ -536,6 +537,7 @@ void page_cache_ra_order(struct readahead_control *ractl, } read_pages(ractl); + filemap_invalidate_unlock_shared(mapping); /* * If there were already pages in the page cache, then we may have diff --git a/mm/slub.c b/mm/slub.c index e5535020e0fd..b1281b8654bd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -726,25 +726,48 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } -static void noinline set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) -{ - struct track *p = get_track(s, object, alloc); - #ifdef CONFIG_STACKDEPOT +static noinline depot_stack_handle_t set_track_prepare(void) +{ + depot_stack_handle_t handle; unsigned long entries[TRACK_ADDRS_COUNT]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); - p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + + return handle; +} +#else +static inline depot_stack_handle_t set_track_prepare(void) +{ + return 0; +} #endif +static void set_track_update(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, + depot_stack_handle_t handle) +{ + struct track *p = get_track(s, object, alloc); + +#ifdef CONFIG_STACKDEPOT + p->handle = handle; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; p->when = jiffies; } +static __always_inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) +{ + depot_stack_handle_t handle = set_track_prepare(); + + set_track_update(s, object, alloc, addr, handle); +} + static void init_tracking(struct kmem_cache *s, void *object) { struct track *p; @@ -1373,6 +1396,10 @@ static noinline int free_debug_processing( int cnt = 0; unsigned long flags, flags2; int ret = 0; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); spin_lock_irqsave(&n->list_lock, flags); slab_lock(slab, &flags2); @@ -1391,7 +1418,7 @@ next_object: } if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_FREE, addr); + set_track_update(s, object, TRACK_FREE, addr, handle); trace(s, slab, object, 0); /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ init_object(s, object, SLUB_RED_INACTIVE); @@ -2936,6 +2963,7 @@ redo: if (!freelist) { c->slab = NULL; + c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2968,6 +2996,7 @@ deactivate_slab: freelist = c->freelist; c->slab = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, slab, freelist); diff --git a/net/core/dev.c b/net/core/dev.c index 08ce317fcec8..8e6f22961206 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -397,16 +397,18 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev) +static void unlist_netdevice(struct net_device *dev, bool lock) { ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); + if (lock) + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); + if (lock) + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -10043,11 +10045,11 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) { - dev->reg_state = NETREG_UNREGISTERED; + write_lock(&dev_base_lock); + dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + write_unlock(&dev_base_lock); + if (ret) goto err_uninit; - } - dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -10329,7 +10331,9 @@ void netdev_run_todo(void) continue; } + write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; + write_unlock(&dev_base_lock); linkwatch_forget_dev(dev); } @@ -10810,9 +10814,10 @@ void unregister_netdevice_many(struct list_head *head) list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - unlist_netdevice(dev); - + write_lock(&dev_base_lock); + unlist_netdevice(dev, false); dev->reg_state = NETREG_UNREGISTERING; + write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -10959,7 +10964,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev); + unlist_netdevice(dev, true); synchronize_net(); diff --git a/net/core/filter.c b/net/core/filter.c index 5af58eb48587..5d16d66727fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6516,10 +6516,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex, proto, netns_id, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6553,10 +6564,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e319e242dddf..a3642569fe53 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -33,6 +33,7 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; +/* Caller holds RTNL or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { return dev->reg_state <= NETREG_REGISTERED; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 22b983ade0e7..b0fcd0200e84 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -699,6 +699,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 7e6b37a54add..1c94bb8ea03f 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -36,7 +36,7 @@ static int fallback_set_params(struct eeprom_req_info *request, if (request->page) offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset; - if (modinfo->type == ETH_MODULE_SFF_8079 && + if (modinfo->type == ETH_MODULE_SFF_8472 && request->i2c_address == 0x51) offset += ETH_MODULE_EEPROM_PAGE_LEN * 2; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3b9cd487075a..5c58e21f724e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -524,7 +524,6 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) int tunnel_hlen; int version; int nhoff; - int thoff; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -558,10 +557,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; - thoff = skb_transport_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IPV6) && - (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) - truncate = true; + if (skb->protocol == htons(ETH_P_IPV6)) { + int thoff; + + if (skb_transport_header_was_set(skb)) + thoff = skb_transport_header(skb) - skb_mac_header(skb); + else + thoff = nhoff + sizeof(struct ipv6hdr); + if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) + truncate = true; + } if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 1a43ca73f94d..3c6101def7d6 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -319,12 +319,16 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); + if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) + return 0; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); - if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk), - addr->sin_addr.s_addr, - chk_addr_ret)) + if (chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST || + (chk_addr_ret != RTN_LOCAL && + !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index be3947e70fec..0d3f68bb51c0 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -611,9 +611,6 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) return 0; } - if (inet_csk_has_ulp(sk)) - return -EINVAL; - if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4e37f7c29900..a9051df0625d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -939,7 +939,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, __be16 proto; __u32 mtu; int nhoff; - int thoff; if (!pskb_inet_may_pull(skb)) goto tx_err; @@ -960,10 +959,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; - thoff = skb_transport_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IPV6) && - (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) - truncate = true; + if (skb->protocol == htons(ETH_P_IPV6)) { + int thoff; + + if (skb_transport_header_was_set(skb)) + thoff = skb_transport_header(skb) - skb_mac_header(skb); + else + thoff = nhoff + sizeof(struct ipv6hdr); + if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) + truncate = true; + } if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 7873bd1389c3..a8e2425e43b0 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,14 +13,31 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> -static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) +#define NF_RECURSION_LIMIT 2 + +static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); + +static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, + enum nf_dev_hooks hook) { - if (skb_mac_header_was_set(skb)) + if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + goto err; + + if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { + if (skb_cow_head(skb, skb->mac_len)) + goto err; + skb_push(skb, skb->mac_len); + } skb->dev = dev; skb_clear_tstamp(skb); + __this_cpu_inc(nf_dup_skb_recursion); dev_queue_xmit(skb); + __this_cpu_dec(nf_dup_skb_recursion); + return; +err: + kfree_skb(skb); } void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) @@ -33,7 +50,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) return; } - nf_do_netdev_egress(pkt->skb, dev); + nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt)); } EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress); @@ -48,7 +65,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) skb = skb_clone(pkt->skb, GFP_ATOMIC); if (skb) - nf_do_netdev_egress(skb, dev); + nf_do_netdev_egress(skb, dev, nft_hook(pkt)); } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index af15102bc696..f466af4f8531 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -614,7 +614,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) nf_ct_untimeout(net, NULL); - list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) { + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ac4859241e17..55d2d49c3425 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/random.h> #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> @@ -32,8 +33,6 @@ #define NFT_META_SECS_PER_DAY 86400 #define NFT_META_DAYS_PER_WEEK 7 -static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); - static u8 nft_meta_weekday(void) { time64_t secs = ktime_get_real_seconds(); @@ -271,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return true; } -static noinline u32 nft_prandom_u32(void) -{ - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - - return prandom_u32_state(state); -} - #ifdef CONFIG_IP_ROUTE_CLASSID static noinline bool nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) @@ -389,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; #endif case NFT_META_PRANDOM: - *dest = nft_prandom_u32(); + *dest = get_random_u32(); break; #ifdef CONFIG_XFRM case NFT_META_SECPATH: @@ -518,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = IFNAMSIZ; break; case NFT_META_PRANDOM: - prandom_init_once(&nft_prandom_state); len = sizeof(u32); break; #ifdef CONFIG_XFRM diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 81b40c663d86..45d3dc9e96f2 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -9,12 +9,11 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/random.h> #include <linux/static_key.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> -static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); - struct nft_ng_inc { u8 dreg; u32 modulus; @@ -135,12 +134,9 @@ struct nft_ng_random { u32 offset; }; -static u32 nft_ng_random_gen(struct nft_ng_random *priv) +static u32 nft_ng_random_gen(const struct nft_ng_random *priv) { - struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); - - return reciprocal_scale(prandom_u32_state(state), priv->modulus) + - priv->offset; + return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset; } static void nft_ng_random_eval(const struct nft_expr *expr, @@ -168,8 +164,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - prandom_init_once(&nft_numgen_prandom_state); - return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 372bf54a0ca9..e20d1a973417 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -407,7 +407,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) if (flags & IP6_FH_F_FRAG) { if (frag_off) { key->ip.frag = OVS_FRAG_TYPE_LATER; - key->ip.proto = nexthdr; + key->ip.proto = NEXTHDR_FRAGMENT; return 0; } key->ip.frag = OVS_FRAG_TYPE_FIRST; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ed4ccef5d6a8..5449ed114e40 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1146,9 +1146,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_rate rate; struct tc_netem_slot slot; - qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency), + qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); - qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter), + qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter), UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e2c6eca0271b..b6781ada3aa8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -651,6 +651,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_principal = clnt->cl_principal; + new->cl_max_connect = clnt->cl_max_connect; return new; out_err: diff --git a/net/tipc/core.c b/net/tipc/core.c index 3f4542e0f065..434e70eabe08 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -109,10 +109,9 @@ static void __net_exit tipc_exit_net(struct net *net) struct tipc_net *tn = tipc_net(net); tipc_detach_loopback(net); + tipc_net_stop(net); /* Make sure the tipc_net_finalize_work() finished */ cancel_work_sync(&tn->work); - tipc_net_stop(net); - tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index da176411c1b5..2ffede463e4a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -921,6 +921,8 @@ static void tls_update(struct sock *sk, struct proto *p, { struct tls_context *ctx; + WARN_ON_ONCE(sk->sk_prot == p); + ctx = tls_get_ctx(sk); if (likely(ctx)) { ctx->sk_write_space = write_space; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 19ac872a6624..09002387987e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -538,12 +538,6 @@ static int xsk_generic_xmit(struct sock *sk) goto out; } - skb = xsk_build_skb(xs, &desc); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - goto out; - } - /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement @@ -552,11 +546,19 @@ static int xsk_generic_xmit(struct sock *sk) spin_lock_irqsave(&xs->pool->cq_lock, flags); if (xskq_prod_reserve(xs->pool->cq)) { spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - kfree_skb(skb); goto out; } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + goto out; + } + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c index 24d3cf109140..01ee6c8c8382 100644 --- a/samples/fprobe/fprobe_example.c +++ b/samples/fprobe/fprobe_example.c @@ -21,6 +21,7 @@ #define BACKTRACE_DEPTH 16 #define MAX_SYMBOL_LEN 4096 struct fprobe sample_probe; +static unsigned long nhit; static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; module_param_string(symbol, symbol, sizeof(symbol), 0644); @@ -28,6 +29,8 @@ static char nosymbol[MAX_SYMBOL_LEN] = ""; module_param_string(nosymbol, nosymbol, sizeof(nosymbol), 0644); static bool stackdump = true; module_param(stackdump, bool, 0644); +static bool use_trace = false; +module_param(use_trace, bool, 0644); static void show_backtrace(void) { @@ -40,7 +43,15 @@ static void show_backtrace(void) static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs) { - pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + else + pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + nhit++; if (stackdump) show_backtrace(); } @@ -49,8 +60,17 @@ static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_r { unsigned long rip = instruction_pointer(regs); - pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", - (void *)ip, (void *)ip, (void *)rip, (void *)rip); + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + else + pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + nhit++; if (stackdump) show_backtrace(); } @@ -112,7 +132,8 @@ static void __exit fprobe_exit(void) { unregister_fprobe(&sample_probe); - pr_info("fprobe at %s unregistered\n", symbol); + pr_info("fprobe at %s unregistered. %ld times hit, %ld times missed\n", + symbol, nhit, sample_probe.nmissed); } module_init(fprobe_init) diff --git a/scripts/faddr2line b/scripts/faddr2line index 0e6268d59883..94ed98dd899f 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -95,17 +95,25 @@ __faddr2line() { local print_warnings=$4 local sym_name=${func_addr%+*} - local offset=${func_addr#*+} - offset=${offset%/*} + local func_offset=${func_addr#*+} + func_offset=${func_offset%/*} local user_size= + local file_type + local is_vmlinux=0 [[ $func_addr =~ "/" ]] && user_size=${func_addr#*/} - if [[ -z $sym_name ]] || [[ -z $offset ]] || [[ $sym_name = $func_addr ]]; then + if [[ -z $sym_name ]] || [[ -z $func_offset ]] || [[ $sym_name = $func_addr ]]; then warn "bad func+offset $func_addr" DONE=1 return fi + # vmlinux uses absolute addresses in the section table rather than + # section offsets. + local file_type=$(${READELF} --file-header $objfile | + ${AWK} '$1 == "Type:" { print $2; exit }') + [[ $file_type = "EXEC" ]] && is_vmlinux=1 + # Go through each of the object's symbols which match the func name. # In rare cases there might be duplicates, in which case we print all # matches. @@ -114,9 +122,11 @@ __faddr2line() { local sym_addr=0x${fields[1]} local sym_elf_size=${fields[2]} local sym_sec=${fields[6]} + local sec_size + local sec_name # Get the section size: - local sec_size=$(${READELF} --section-headers --wide $objfile | + sec_size=$(${READELF} --section-headers --wide $objfile | sed 's/\[ /\[/' | ${AWK} -v sec=$sym_sec '$1 == "[" sec "]" { print "0x" $6; exit }') @@ -126,6 +136,17 @@ __faddr2line() { return fi + # Get the section name: + sec_name=$(${READELF} --section-headers --wide $objfile | + sed 's/\[ /\[/' | + ${AWK} -v sec=$sym_sec '$1 == "[" sec "]" { print $2; exit }') + + if [[ -z $sec_name ]]; then + warn "bad section name: section: $sym_sec" + DONE=1 + return + fi + # Calculate the symbol size. # # Unfortunately we can't use the ELF size, because kallsyms @@ -174,10 +195,10 @@ __faddr2line() { sym_size=0x$(printf %x $sym_size) - # Calculate the section address from user-supplied offset: - local addr=$(($sym_addr + $offset)) + # Calculate the address from user-supplied offset: + local addr=$(($sym_addr + $func_offset)) if [[ -z $addr ]] || [[ $addr = 0 ]]; then - warn "bad address: $sym_addr + $offset" + warn "bad address: $sym_addr + $func_offset" DONE=1 return fi @@ -191,9 +212,9 @@ __faddr2line() { fi # Make sure the provided offset is within the symbol's range: - if [[ $offset -gt $sym_size ]]; then + if [[ $func_offset -gt $sym_size ]]; then [[ $print_warnings = 1 ]] && - echo "skipping $sym_name address at $addr due to size mismatch ($offset > $sym_size)" + echo "skipping $sym_name address at $addr due to size mismatch ($func_offset > $sym_size)" continue fi @@ -202,11 +223,13 @@ __faddr2line() { [[ $FIRST = 0 ]] && echo FIRST=0 - echo "$sym_name+$offset/$sym_size:" + echo "$sym_name+$func_offset/$sym_size:" # Pass section address to addr2line and strip absolute paths # from the output: - local output=$(${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;") + local args="--functions --pretty-print --inlines --exe=$objfile" + [[ $is_vmlinux = 0 ]] && args="$args --section=$sec_name" + local output=$(${ADDR2LINE} $args $addr | sed "s; $dir_prefix\(\./\)*; ;") [[ -z $output ]] && continue # Default output (non --list): diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 15dc7160ba34..8cfdaee77905 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -431,33 +431,17 @@ static const struct snd_malloc_ops snd_dma_iram_ops = { */ static void *snd_dma_dev_alloc(struct snd_dma_buffer *dmab, size_t size) { - void *p; - - p = dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); -#ifdef CONFIG_X86 - if (p && dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC) - set_memory_wc((unsigned long)p, PAGE_ALIGN(size) >> PAGE_SHIFT); -#endif - return p; + return dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_dev_free(struct snd_dma_buffer *dmab) { -#ifdef CONFIG_X86 - if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC) - set_memory_wb((unsigned long)dmab->area, - PAGE_ALIGN(dmab->bytes) >> PAGE_SHIFT); -#endif dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_dev_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { -#ifdef CONFIG_X86 - if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC) - area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); -#endif return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } @@ -471,10 +455,6 @@ static const struct snd_malloc_ops snd_dma_dev_ops = { /* * Write-combined pages */ -#ifdef CONFIG_X86 -/* On x86, share the same ops as the standard dev ops */ -#define snd_dma_wc_ops snd_dma_dev_ops -#else /* CONFIG_X86 */ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_wc(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); @@ -497,7 +477,6 @@ static const struct snd_malloc_ops snd_dma_wc_ops = { .free = snd_dma_wc_free, .mmap = snd_dma_wc_mmap, }; -#endif /* CONFIG_X86 */ #ifdef CONFIG_SND_DMA_SGBUF static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size); diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index 3f35972e1cf7..161a9711cd63 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -119,21 +119,18 @@ static int i915_component_master_match(struct device *dev, int subcomponent, /* check whether Intel graphics is present and reachable */ static int i915_gfx_present(struct pci_dev *hdac_pci) { - unsigned int class = PCI_BASE_CLASS_DISPLAY << 16; struct pci_dev *display_dev = NULL; - bool match = false; - do { - display_dev = pci_get_class(class, display_dev); - - if (display_dev && display_dev->vendor == PCI_VENDOR_ID_INTEL && + for_each_pci_dev(display_dev) { + if (display_dev->vendor == PCI_VENDOR_ID_INTEL && + (display_dev->class >> 16) == PCI_BASE_CLASS_DISPLAY && connectivity_check(display_dev, hdac_pci)) { pci_dev_put(display_dev); - match = true; + return true; } - } while (!match && display_dev); + } - return match; + return false; } /** diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index a8fe01764b25..ec9cbb219bc1 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -196,6 +196,12 @@ static const struct config_entry config_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "Google"), } }, + { + .ident = "UP-WHL", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "AAEON"), + } + }, {} } }, @@ -358,6 +364,12 @@ static const struct config_entry config_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "Google"), } }, + { + .ident = "UPX-TGL", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "AAEON"), + } + }, {} } }, diff --git a/sound/hda/intel-nhlt.c b/sound/hda/intel-nhlt.c index 4063da378283..9db5ccd9aa2d 100644 --- a/sound/hda/intel-nhlt.c +++ b/sound/hda/intel-nhlt.c @@ -55,8 +55,8 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) /* find max number of channels based on format_configuration */ if (fmt_configs->fmt_count) { - dev_dbg(dev, "%s: found %d format definitions\n", - __func__, fmt_configs->fmt_count); + dev_dbg(dev, "found %d format definitions\n", + fmt_configs->fmt_count); for (i = 0; i < fmt_configs->fmt_count; i++) { struct wav_fmt_ext *fmt_ext; @@ -66,9 +66,9 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) if (fmt_ext->fmt.channels > max_ch) max_ch = fmt_ext->fmt.channels; } - dev_dbg(dev, "%s: max channels found %d\n", __func__, max_ch); + dev_dbg(dev, "max channels found %d\n", max_ch); } else { - dev_dbg(dev, "%s: No format information found\n", __func__); + dev_dbg(dev, "No format information found\n"); } if (cfg->device_config.config_type != NHLT_CONFIG_TYPE_MIC_ARRAY) { @@ -95,17 +95,16 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) } if (dmic_geo > 0) { - dev_dbg(dev, "%s: Array with %d dmics\n", __func__, dmic_geo); + dev_dbg(dev, "Array with %d dmics\n", dmic_geo); } if (max_ch > dmic_geo) { - dev_dbg(dev, "%s: max channels %d exceed dmic number %d\n", - __func__, max_ch, dmic_geo); + dev_dbg(dev, "max channels %d exceed dmic number %d\n", + max_ch, dmic_geo); } } } - dev_dbg(dev, "%s: dmic number %d max_ch %d\n", - __func__, dmic_geo, max_ch); + dev_dbg(dev, "dmic number %d max_ch %d\n", dmic_geo, max_ch); return dmic_geo; } diff --git a/sound/pci/hda/hda_auto_parser.c b/sound/pci/hda/hda_auto_parser.c index cd1db943b7e0..7c6b1fe8dfcc 100644 --- a/sound/pci/hda/hda_auto_parser.c +++ b/sound/pci/hda/hda_auto_parser.c @@ -819,7 +819,7 @@ static void set_pin_targets(struct hda_codec *codec, snd_hda_set_pin_ctl_cache(codec, cfg->nid, cfg->val); } -static void apply_fixup(struct hda_codec *codec, int id, int action, int depth) +void __snd_hda_apply_fixup(struct hda_codec *codec, int id, int action, int depth) { const char *modelname = codec->fixup_name; @@ -829,7 +829,7 @@ static void apply_fixup(struct hda_codec *codec, int id, int action, int depth) if (++depth > 10) break; if (fix->chained_before) - apply_fixup(codec, fix->chain_id, action, depth + 1); + __snd_hda_apply_fixup(codec, fix->chain_id, action, depth + 1); switch (fix->type) { case HDA_FIXUP_PINS: @@ -870,6 +870,7 @@ static void apply_fixup(struct hda_codec *codec, int id, int action, int depth) id = fix->chain_id; } } +EXPORT_SYMBOL_GPL(__snd_hda_apply_fixup); /** * snd_hda_apply_fixup - Apply the fixup chain with the given action @@ -879,7 +880,7 @@ static void apply_fixup(struct hda_codec *codec, int id, int action, int depth) void snd_hda_apply_fixup(struct hda_codec *codec, int action) { if (codec->fixup_list) - apply_fixup(codec, codec->fixup_id, action, 0); + __snd_hda_apply_fixup(codec, codec->fixup_id, action, 0); } EXPORT_SYMBOL_GPL(snd_hda_apply_fixup); diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h index aca592651870..682dca2057db 100644 --- a/sound/pci/hda/hda_local.h +++ b/sound/pci/hda/hda_local.h @@ -348,6 +348,7 @@ void snd_hda_apply_verbs(struct hda_codec *codec); void snd_hda_apply_pincfgs(struct hda_codec *codec, const struct hda_pintbl *cfg); void snd_hda_apply_fixup(struct hda_codec *codec, int action); +void __snd_hda_apply_fixup(struct hda_codec *codec, int id, int action, int depth); void snd_hda_pick_fixup(struct hda_codec *codec, const struct hda_model_fixup *models, const struct snd_pci_quirk *quirk, diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 1248d1a51cf0..3e541a4c0423 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1079,11 +1079,11 @@ static int patch_conexant_auto(struct hda_codec *codec) if (err < 0) goto error; - err = snd_hda_gen_parse_auto_config(codec, &spec->gen.autocfg); + err = cx_auto_parse_beep(codec); if (err < 0) goto error; - err = cx_auto_parse_beep(codec); + err = snd_hda_gen_parse_auto_config(codec, &spec->gen.autocfg); if (err < 0) goto error; diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index b0f954118e72..cee69fa7e246 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2634,6 +2634,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67f1, "Clevo PC70H[PRS]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x67f5, "Clevo PD70PN[NRT]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x70d1, "Clevo PC70[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170SM", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x7715, "Clevo X170KM-G", ALC1220_FIXUP_CLEVO_PB51ED), @@ -7004,6 +7005,7 @@ enum { ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS, ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE, ALC287_FIXUP_YOGA7_14ITL_SPEAKERS, + ALC298_FIXUP_LENOVO_C940_DUET7, ALC287_FIXUP_13S_GEN2_SPEAKERS, ALC256_FIXUP_SET_COEF_DEFAULTS, ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE, @@ -7022,6 +7024,23 @@ enum { ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE, }; +/* A special fixup for Lenovo C940 and Yoga Duet 7; + * both have the very same PCI SSID, and we need to apply different fixups + * depending on the codec ID + */ +static void alc298_fixup_lenovo_c940_duet7(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + int id; + + if (codec->core.vendor_id == 0x10ec0298) + id = ALC298_FIXUP_LENOVO_SPK_VOLUME; /* C940 */ + else + id = ALC287_FIXUP_YOGA7_14ITL_SPEAKERS; /* Duet 7 */ + __snd_hda_apply_fixup(codec, id, action, 0); +} + static const struct hda_fixup alc269_fixups[] = { [ALC269_FIXUP_GPIO2] = { .type = HDA_FIXUP_FUNC, @@ -8721,6 +8740,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE, }, + [ALC298_FIXUP_LENOVO_C940_DUET7] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc298_fixup_lenovo_c940_duet7, + }, [ALC287_FIXUP_13S_GEN2_SPEAKERS] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -9022,6 +9045,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x8783, "HP ZBook Fury 15 G7 Mobile Workstation", ALC285_FIXUP_HP_GPIO_AMP_INIT), + SND_PCI_QUIRK(0x103c, 0x8787, "HP OMEN 15", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8788, "HP OMEN 15", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x87c8, "HP", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x87e5, "HP ProBook 440 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), @@ -9187,6 +9211,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x70f3, "Clevo NH77DPQ", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70f4, "Clevo NH77EPY", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70f6, "Clevo NH77DPQ-Y", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x7716, "Clevo NS50PU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8228, "Clevo NR40BU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8520, "Clevo NH50D[CD]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8521, "Clevo NH77D[CD]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), @@ -9273,7 +9298,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x31af, "ThinkCentre Station", ALC623_FIXUP_LENOVO_THINKSTATION_P340), SND_PCI_QUIRK(0x17aa, 0x3802, "Lenovo Yoga DuetITL 2021", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), - SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940", ALC298_FIXUP_LENOVO_SPK_VOLUME), + SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3820, "Yoga Duet 7 13ITL6", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3824, "Legion Y9000X 2020", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), @@ -10737,6 +10762,7 @@ enum { ALC668_FIXUP_MIC_DET_COEF, ALC897_FIXUP_LENOVO_HEADSET_MIC, ALC897_FIXUP_HEADSET_MIC_PIN, + ALC897_FIXUP_HP_HSMIC_VERB, }; static const struct hda_fixup alc662_fixups[] = { @@ -11156,6 +11182,13 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC897_FIXUP_LENOVO_HEADSET_MIC }, + [ALC897_FIXUP_HP_HSMIC_VERB] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x01a1913c }, /* use as headset mic, without its own jack detect */ + { } + }, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -11181,6 +11214,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0698, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), + SND_PCI_QUIRK(0x103c, 0x8719, "HP", ALC897_FIXUP_HP_HSMIC_VERB), SND_PCI_QUIRK(0x103c, 0x873e, "HP", ALC671_FIXUP_HP_HEADSET_MIC2), SND_PCI_QUIRK(0x103c, 0x885f, "HP 288 Pro G8", ALC671_FIXUP_HP_HEADSET_MIC2), SND_PCI_QUIRK(0x1043, 0x1080, "Asus UX501VW", ALC668_FIXUP_HEADSET_MODE), diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index a05304f340df..aea7fae2ca4b 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -518,11 +518,11 @@ static int via_parse_auto_config(struct hda_codec *codec) if (err < 0) return err; - err = snd_hda_gen_parse_auto_config(codec, &spec->gen.autocfg); + err = auto_parse_beep(codec); if (err < 0) return err; - err = auto_parse_beep(codec); + err = snd_hda_gen_parse_auto_config(codec, &spec->gen.autocfg); if (err < 0) return err; diff --git a/sound/usb/mixer_us16x08.c b/sound/usb/mixer_us16x08.c index b7b6f3834ed5..6eb7d93b358d 100644 --- a/sound/usb/mixer_us16x08.c +++ b/sound/usb/mixer_us16x08.c @@ -637,10 +637,10 @@ static int snd_get_meter_comp_index(struct snd_us16x08_meter_store *store) } } else { /* skip channels with no compressor active */ - while (!store->comp_store->val[ + while (store->comp_index <= SND_US16X08_MAX_CHANNELS + && !store->comp_store->val[ COMP_STORE_IDX(SND_US16X08_ID_COMP_SWITCH)] - [store->comp_index - 1] - && store->comp_index <= SND_US16X08_MAX_CHANNELS) { + [store->comp_index - 1]) { store->comp_index++; } ret = store->comp_index++; diff --git a/sound/x86/intel_hdmi_audio.c b/sound/x86/intel_hdmi_audio.c index 0d828e35b401..ab95fb34a635 100644 --- a/sound/x86/intel_hdmi_audio.c +++ b/sound/x86/intel_hdmi_audio.c @@ -33,6 +33,8 @@ #include <drm/intel_lpe_audio.h> #include "intel_hdmi_audio.h" +#define INTEL_HDMI_AUDIO_SUSPEND_DELAY_MS 5000 + #define for_each_pipe(card_ctx, pipe) \ for ((pipe) = 0; (pipe) < (card_ctx)->num_pipes; (pipe)++) #define for_each_port(card_ctx, port) \ @@ -1066,7 +1068,9 @@ static int had_pcm_open(struct snd_pcm_substream *substream) intelhaddata = snd_pcm_substream_chip(substream); runtime = substream->runtime; - pm_runtime_get_sync(intelhaddata->dev); + retval = pm_runtime_resume_and_get(intelhaddata->dev); + if (retval < 0) + return retval; /* set the runtime hw parameter with local snd_pcm_hardware struct */ runtime->hw = had_pcm_hardware; @@ -1534,8 +1538,12 @@ static void had_audio_wq(struct work_struct *work) container_of(work, struct snd_intelhad, hdmi_audio_wq); struct intel_hdmi_lpe_audio_pdata *pdata = ctx->dev->platform_data; struct intel_hdmi_lpe_audio_port_pdata *ppdata = &pdata->port[ctx->port]; + int ret; + + ret = pm_runtime_resume_and_get(ctx->dev); + if (ret < 0) + return; - pm_runtime_get_sync(ctx->dev); mutex_lock(&ctx->mutex); if (ppdata->pipe < 0) { dev_dbg(ctx->dev, "%s: Event: HAD_NOTIFY_HOT_UNPLUG : port = %d\n", @@ -1802,8 +1810,11 @@ static int __hdmi_lpe_audio_probe(struct platform_device *pdev) pdata->notify_audio_lpe = notify_audio_lpe; spin_unlock_irq(&pdata->lpe_audio_slock); + pm_runtime_set_autosuspend_delay(&pdev->dev, INTEL_HDMI_AUDIO_SUSPEND_DELAY_MS); pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_enable(&pdev->dev); pm_runtime_mark_last_busy(&pdev->dev); + pm_runtime_idle(&pdev->dev); dev_dbg(&pdev->dev, "%s: handle pending notification\n", __func__); for_each_port(card_ctx, port) { diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index e09d6908a21d..8aa0d276a636 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -36,7 +36,7 @@ #define MIDR_VARIANT(midr) \ (((midr) & MIDR_VARIANT_MASK) >> MIDR_VARIANT_SHIFT) #define MIDR_IMPLEMENTOR_SHIFT 24 -#define MIDR_IMPLEMENTOR_MASK (0xff << MIDR_IMPLEMENTOR_SHIFT) +#define MIDR_IMPLEMENTOR_MASK (0xffU << MIDR_IMPLEMENTOR_SHIFT) #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) @@ -118,6 +118,10 @@ #define APPLE_CPU_PART_M1_ICESTORM 0x022 #define APPLE_CPU_PART_M1_FIRESTORM 0x023 +#define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 +#define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 +#define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 +#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) @@ -164,6 +168,10 @@ #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) +#define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO) +#define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) +#define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX) +#define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX @@ -172,7 +180,7 @@ #ifndef __ASSEMBLY__ -#include "sysreg.h" +#include <asm/sysreg.h> #define read_cpuid(reg) read_sysreg_s(SYS_ ## reg) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index bf6e96011dfe..21614807a2cb 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -428,11 +428,12 @@ struct kvm_sync_regs { struct kvm_vcpu_events events; }; -#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) -#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) -#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) -#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) +#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 6491fa8fba6d..15b940ec1eac 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -143,6 +143,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD_FP func:req +#ifdef CONFIG_FRAME_POINTER + STACK_FRAME_NON_STANDARD \func +#endif +.endm + .macro ANNOTATE_NOENDBR .Lhere_\@: .pushsection .discard.noendbr diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index e998764f0262..a5e06dcbba13 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -272,6 +272,15 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +/* arm64 Scalable Matrix Extension controls */ +/* Flag values must be in sync with SVE versions */ +#define PR_SME_SET_VL 63 /* set task vector length */ +# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SME_GET_VL 64 /* get task vector length */ +/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ +# define PR_SME_VL_LEN_MASK 0xffff +# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index c1d58673f6ef..952f3520d5c2 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -149,23 +149,30 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus, int fd, group_fd, *evsel_fd; evsel_fd = FD(evsel, idx, thread); - if (evsel_fd == NULL) - return -EINVAL; + if (evsel_fd == NULL) { + err = -EINVAL; + goto out; + } err = get_group_fd(evsel, idx, thread, &group_fd); if (err < 0) - return err; + goto out; fd = sys_perf_event_open(&evsel->attr, threads->map[thread].pid, cpu, group_fd, 0); - if (fd < 0) - return -errno; + if (fd < 0) { + err = -errno; + goto out; + } *evsel_fd = fd; } } +out: + if (err) + perf_evsel__close(evsel); return err; } diff --git a/tools/perf/tests/bp_account.c b/tools/perf/tests/bp_account.c index d1ebb5561e5b..6f921db33cf9 100644 --- a/tools/perf/tests/bp_account.c +++ b/tools/perf/tests/bp_account.c @@ -151,11 +151,21 @@ static int detect_ioctl(void) static int detect_share(int wp_cnt, int bp_cnt) { struct perf_event_attr attr; - int i, fd[wp_cnt + bp_cnt], ret; + int i, *fd = NULL, ret = -1; + + if (wp_cnt + bp_cnt == 0) + return 0; + + fd = malloc(sizeof(int) * (wp_cnt + bp_cnt)); + if (!fd) + return -1; for (i = 0; i < wp_cnt; i++) { fd[i] = wp_event((void *)&the_var, &attr); - TEST_ASSERT_VAL("failed to create wp\n", fd[i] != -1); + if (fd[i] == -1) { + pr_err("failed to create wp\n"); + goto out; + } } for (; i < (bp_cnt + wp_cnt); i++) { @@ -166,9 +176,11 @@ static int detect_share(int wp_cnt, int bp_cnt) ret = i != (bp_cnt + wp_cnt); +out: while (i--) close(fd[i]); + free(fd); return ret; } diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index d54c5371c6a6..5c0032fe93ae 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -97,6 +97,8 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u ret |= test(ctx, "2.2 > 2.2", 0); ret |= test(ctx, "2.2 < 1.1", 0); ret |= test(ctx, "1.1 > 2.2", 0); + ret |= test(ctx, "1.1e10 < 1.1e100", 1); + ret |= test(ctx, "1.1e2 > 1.1e-2", 1); if (ret) { expr__ctx_free(ctx); diff --git a/tools/perf/tests/shell/lib/perf_csv_output_lint.py b/tools/perf/tests/shell/lib/perf_csv_output_lint.py deleted file mode 100644 index 714f283cfb1b..000000000000 --- a/tools/perf/tests/shell/lib/perf_csv_output_lint.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/python -# SPDX-License-Identifier: GPL-2.0 - -import argparse -import sys - -# Basic sanity check of perf CSV output as specified in the man page. -# Currently just checks the number of fields per line in output. - -ap = argparse.ArgumentParser() -ap.add_argument('--no-args', action='store_true') -ap.add_argument('--interval', action='store_true') -ap.add_argument('--system-wide-no-aggr', action='store_true') -ap.add_argument('--system-wide', action='store_true') -ap.add_argument('--event', action='store_true') -ap.add_argument('--per-core', action='store_true') -ap.add_argument('--per-thread', action='store_true') -ap.add_argument('--per-die', action='store_true') -ap.add_argument('--per-node', action='store_true') -ap.add_argument('--per-socket', action='store_true') -ap.add_argument('--separator', default=',', nargs='?') -args = ap.parse_args() - -Lines = sys.stdin.readlines() - -def check_csv_output(exp): - for line in Lines: - if 'failed' not in line: - count = line.count(args.separator) - if count != exp: - sys.stdout.write(''.join(Lines)) - raise RuntimeError(f'wrong number of fields. expected {exp} in {line}') - -try: - if args.no_args or args.system_wide or args.event: - expected_items = 6 - elif args.interval or args.per_thread or args.system_wide_no_aggr: - expected_items = 7 - elif args.per_core or args.per_socket or args.per_node or args.per_die: - expected_items = 8 - else: - ap.print_help() - raise RuntimeError('No checking option specified') - check_csv_output(expected_items) - -except: - sys.stdout.write('Test failed for input: ' + ''.join(Lines)) - raise diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh index 983220ef3cb4..38c26f3ef4c1 100755 --- a/tools/perf/tests/shell/stat+csv_output.sh +++ b/tools/perf/tests/shell/stat+csv_output.sh @@ -6,20 +6,41 @@ set -e -pythonchecker=$(dirname $0)/lib/perf_csv_output_lint.py -if [ "x$PYTHON" == "x" ] -then - if which python3 > /dev/null - then - PYTHON=python3 - elif which python > /dev/null - then - PYTHON=python - else - echo Skipping test, python not detected please set environment variable PYTHON. - exit 2 - fi -fi +function commachecker() +{ + local -i cnt=0 exp=0 + + case "$1" + in "--no-args") exp=6 + ;; "--system-wide") exp=6 + ;; "--event") exp=6 + ;; "--interval") exp=7 + ;; "--per-thread") exp=7 + ;; "--system-wide-no-aggr") exp=7 + [ $(uname -m) = "s390x" ] && exp=6 + ;; "--per-core") exp=8 + ;; "--per-socket") exp=8 + ;; "--per-node") exp=8 + ;; "--per-die") exp=8 + esac + + while read line + do + # Check for lines beginning with Failed + x=${line:0:6} + [ "$x" = "Failed" ] && continue + + # Count the number of commas + x=$(echo $line | tr -d -c ',') + cnt="${#x}" + # echo $line $cnt + [ "$cnt" -ne "$exp" ] && { + echo "wrong number of fields. expected $exp in $line" 1>&2 + exit 1; + } + done + return 0 +} # Return true if perf_event_paranoid is > $1 and not running as root. function ParanoidAndNotRoot() @@ -30,7 +51,7 @@ function ParanoidAndNotRoot() check_no_args() { echo -n "Checking CSV output: no args " - perf stat -x, true 2>&1 | $PYTHON $pythonchecker --no-args + perf stat -x, true 2>&1 | commachecker --no-args echo "[Success]" } @@ -42,7 +63,7 @@ check_system_wide() echo "[Skip] paranoid and not root" return fi - perf stat -x, -a true 2>&1 | $PYTHON $pythonchecker --system-wide + perf stat -x, -a true 2>&1 | commachecker --system-wide echo "[Success]" } @@ -55,14 +76,14 @@ check_system_wide_no_aggr() return fi echo -n "Checking CSV output: system wide no aggregation " - perf stat -x, -A -a --no-merge true 2>&1 | $PYTHON $pythonchecker --system-wide-no-aggr + perf stat -x, -A -a --no-merge true 2>&1 | commachecker --system-wide-no-aggr echo "[Success]" } check_interval() { echo -n "Checking CSV output: interval " - perf stat -x, -I 1000 true 2>&1 | $PYTHON $pythonchecker --interval + perf stat -x, -I 1000 true 2>&1 | commachecker --interval echo "[Success]" } @@ -70,7 +91,7 @@ check_interval() check_event() { echo -n "Checking CSV output: event " - perf stat -x, -e cpu-clock true 2>&1 | $PYTHON $pythonchecker --event + perf stat -x, -e cpu-clock true 2>&1 | commachecker --event echo "[Success]" } @@ -82,7 +103,7 @@ check_per_core() echo "[Skip] paranoid and not root" return fi - perf stat -x, --per-core -a true 2>&1 | $PYTHON $pythonchecker --per-core + perf stat -x, --per-core -a true 2>&1 | commachecker --per-core echo "[Success]" } @@ -94,7 +115,7 @@ check_per_thread() echo "[Skip] paranoid and not root" return fi - perf stat -x, --per-thread -a true 2>&1 | $PYTHON $pythonchecker --per-thread + perf stat -x, --per-thread -a true 2>&1 | commachecker --per-thread echo "[Success]" } @@ -106,7 +127,7 @@ check_per_die() echo "[Skip] paranoid and not root" return fi - perf stat -x, --per-die -a true 2>&1 | $PYTHON $pythonchecker --per-die + perf stat -x, --per-die -a true 2>&1 | commachecker --per-die echo "[Success]" } @@ -118,7 +139,7 @@ check_per_node() echo "[Skip] paranoid and not root" return fi - perf stat -x, --per-node -a true 2>&1 | $PYTHON $pythonchecker --per-node + perf stat -x, --per-node -a true 2>&1 | commachecker --per-node echo "[Success]" } @@ -130,7 +151,7 @@ check_per_socket() echo "[Skip] paranoid and not root" return fi - perf stat -x, --per-socket -a true 2>&1 | $PYTHON $pythonchecker --per-socket + perf stat -x, --per-socket -a true 2>&1 | commachecker --per-socket echo "[Success]" } diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh index 6ffbb27afaba..ec108d45d3c6 100755 --- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh +++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh @@ -43,7 +43,7 @@ CFLAGS="-g -O0 -fno-inline -fno-omit-frame-pointer" cc $CFLAGS $TEST_PROGRAM_SOURCE -o $TEST_PROGRAM || exit 1 # Add a 1 second delay to skip samples that are not in the leaf() function -perf record -o $PERF_DATA --call-graph fp -e cycles//u -D 1000 -- $TEST_PROGRAM 2> /dev/null & +perf record -o $PERF_DATA --call-graph fp -e cycles//u -D 1000 --user-callchains -- $TEST_PROGRAM 2> /dev/null & PID=$! echo " + Recording (PID=$PID)..." diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index d23a9e322ff5..0b4f61b6cc6b 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -115,7 +115,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) * physical_package_id will be set to -1. Hence skip this * test if physical_package_id returns -1 for cpu from perf_cpu_map. */ - if (strncmp(session->header.env.arch, "powerpc", 7)) { + if (!strncmp(session->header.env.arch, "ppc64le", 7)) { if (cpu__get_socket_id(perf_cpu_map__cpu(map, 0)) == -1) return TEST_SKIP; } diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 6f85f5d957ef..17311ad9f9af 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -50,6 +50,9 @@ struct linger { struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ + + int msg_inq; /* output, data left in socket */ + struct iov_iter msg_iter; /* data */ /* @@ -62,8 +65,9 @@ struct msghdr { void __user *msg_control_user; }; bool msg_control_is_user : 1; - __kernel_size_t msg_controllen; /* ancillary data buffer length */ + bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ + __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ }; @@ -434,6 +438,7 @@ extern struct file *do_accept(struct file *file, unsigned file_flags, extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); +extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 1a80151baed9..d040406f3314 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -387,26 +387,16 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, return arm_spe_deliver_synth_event(spe, speq, event, &sample); } -#define SPE_MEM_TYPE (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \ - ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \ - ARM_SPE_REMOTE_ACCESS) - -static bool arm_spe__is_memory_event(enum arm_spe_sample_type type) -{ - if (type & SPE_MEM_TYPE) - return true; - - return false; -} - static u64 arm_spe__synth_data_source(const struct arm_spe_record *record) { union perf_mem_data_src data_src = { 0 }; if (record->op == ARM_SPE_LD) data_src.mem_op = PERF_MEM_OP_LOAD; - else + else if (record->op == ARM_SPE_ST) data_src.mem_op = PERF_MEM_OP_STORE; + else + return 0; if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { data_src.mem_lvl = PERF_MEM_LVL_L3; @@ -510,7 +500,11 @@ static int arm_spe_sample(struct arm_spe_queue *speq) return err; } - if (spe->sample_memory && arm_spe__is_memory_event(record->type)) { + /* + * When data_src is zero it means the record is not a memory operation, + * skip to synthesize memory sample for this case. + */ + if (spe->sample_memory && data_src) { err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src); if (err) return err; diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l index 0a13eb20c814..4dc8edbfd9ce 100644 --- a/tools/perf/util/expr.l +++ b/tools/perf/util/expr.l @@ -91,7 +91,7 @@ static int literal(yyscan_t scanner) } %} -number ([0-9]+\.?[0-9]*|[0-9]*\.?[0-9]+) +number ([0-9]+\.?[0-9]*|[0-9]*\.?[0-9]+)(e-?[0-9]+)? sch [-,=] spec \\{sch} diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index ee8fcfa115e5..8f7baeabc5cf 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1372,6 +1372,7 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu, *out_evlist = NULL; if (!metric_no_merge || hashmap__size(ids->ids) == 0) { + bool added_event = false; int i; /* * We may fail to share events between metrics because a tool @@ -1393,8 +1394,16 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu, if (!tmp) return -ENOMEM; ids__insert(ids->ids, tmp); + added_event = true; } } + if (!added_event && hashmap__size(ids->ids) == 0) { + char *tmp = strdup("duration_time"); + + if (!tmp) + return -ENOMEM; + ids__insert(ids->ids, tmp); + } } ret = metricgroup__build_event_string(&events, ids, modifier, has_constraint); diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 37622699c91a..6e5b8cce47bf 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -174,7 +174,7 @@ static int elf_section_address_and_offset(int fd, const char *name, u64 *address Elf *elf; GElf_Ehdr ehdr; GElf_Shdr shdr; - int ret; + int ret = -1; elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); if (elf == NULL) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 83ef55e3caa4..2974b44f80fa 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -121,24 +121,24 @@ static void kprobe_multi_link_api_subtest(void) }) GET_ADDR("bpf_fentry_test1", addrs[0]); - GET_ADDR("bpf_fentry_test2", addrs[1]); - GET_ADDR("bpf_fentry_test3", addrs[2]); - GET_ADDR("bpf_fentry_test4", addrs[3]); - GET_ADDR("bpf_fentry_test5", addrs[4]); - GET_ADDR("bpf_fentry_test6", addrs[5]); - GET_ADDR("bpf_fentry_test7", addrs[6]); + GET_ADDR("bpf_fentry_test3", addrs[1]); + GET_ADDR("bpf_fentry_test4", addrs[2]); + GET_ADDR("bpf_fentry_test5", addrs[3]); + GET_ADDR("bpf_fentry_test6", addrs[4]); + GET_ADDR("bpf_fentry_test7", addrs[5]); + GET_ADDR("bpf_fentry_test2", addrs[6]); GET_ADDR("bpf_fentry_test8", addrs[7]); #undef GET_ADDR - cookies[0] = 1; - cookies[1] = 2; - cookies[2] = 3; - cookies[3] = 4; - cookies[4] = 5; - cookies[5] = 6; - cookies[6] = 7; - cookies[7] = 8; + cookies[0] = 1; /* bpf_fentry_test1 */ + cookies[1] = 2; /* bpf_fentry_test3 */ + cookies[2] = 3; /* bpf_fentry_test4 */ + cookies[3] = 4; /* bpf_fentry_test5 */ + cookies[4] = 5; /* bpf_fentry_test6 */ + cookies[5] = 6; /* bpf_fentry_test7 */ + cookies[6] = 7; /* bpf_fentry_test2 */ + cookies[7] = 8; /* bpf_fentry_test8 */ opts.kprobe_multi.addrs = (const unsigned long *) &addrs; opts.kprobe_multi.cnt = ARRAY_SIZE(addrs); @@ -149,14 +149,14 @@ static void kprobe_multi_link_api_subtest(void) if (!ASSERT_GE(link1_fd, 0, "link1_fd")) goto cleanup; - cookies[0] = 8; - cookies[1] = 7; - cookies[2] = 6; - cookies[3] = 5; - cookies[4] = 4; - cookies[5] = 3; - cookies[6] = 2; - cookies[7] = 1; + cookies[0] = 8; /* bpf_fentry_test1 */ + cookies[1] = 7; /* bpf_fentry_test3 */ + cookies[2] = 6; /* bpf_fentry_test4 */ + cookies[3] = 5; /* bpf_fentry_test5 */ + cookies[4] = 4; /* bpf_fentry_test6 */ + cookies[5] = 3; /* bpf_fentry_test7 */ + cookies[6] = 2; /* bpf_fentry_test2 */ + cookies[7] = 1; /* bpf_fentry_test8 */ opts.kprobe_multi.flags = BPF_F_KPROBE_MULTI_RETURN; prog_fd = bpf_program__fd(skel->progs.test_kretprobe); @@ -181,12 +181,12 @@ static void kprobe_multi_attach_api_subtest(void) struct kprobe_multi *skel = NULL; const char *syms[8] = { "bpf_fentry_test1", - "bpf_fentry_test2", "bpf_fentry_test3", "bpf_fentry_test4", "bpf_fentry_test5", "bpf_fentry_test6", "bpf_fentry_test7", + "bpf_fentry_test2", "bpf_fentry_test8", }; __u64 cookies[8]; @@ -198,14 +198,14 @@ static void kprobe_multi_attach_api_subtest(void) skel->bss->pid = getpid(); skel->bss->test_cookie = true; - cookies[0] = 1; - cookies[1] = 2; - cookies[2] = 3; - cookies[3] = 4; - cookies[4] = 5; - cookies[5] = 6; - cookies[6] = 7; - cookies[7] = 8; + cookies[0] = 1; /* bpf_fentry_test1 */ + cookies[1] = 2; /* bpf_fentry_test3 */ + cookies[2] = 3; /* bpf_fentry_test4 */ + cookies[3] = 4; /* bpf_fentry_test5 */ + cookies[4] = 5; /* bpf_fentry_test6 */ + cookies[5] = 6; /* bpf_fentry_test7 */ + cookies[6] = 7; /* bpf_fentry_test2 */ + cookies[7] = 8; /* bpf_fentry_test8 */ opts.syms = syms; opts.cnt = ARRAY_SIZE(syms); @@ -216,14 +216,14 @@ static void kprobe_multi_attach_api_subtest(void) if (!ASSERT_OK_PTR(link1, "bpf_program__attach_kprobe_multi_opts")) goto cleanup; - cookies[0] = 8; - cookies[1] = 7; - cookies[2] = 6; - cookies[3] = 5; - cookies[4] = 4; - cookies[5] = 3; - cookies[6] = 2; - cookies[7] = 1; + cookies[0] = 8; /* bpf_fentry_test1 */ + cookies[1] = 7; /* bpf_fentry_test3 */ + cookies[2] = 6; /* bpf_fentry_test4 */ + cookies[3] = 5; /* bpf_fentry_test5 */ + cookies[4] = 4; /* bpf_fentry_test6 */ + cookies[5] = 3; /* bpf_fentry_test7 */ + cookies[6] = 2; /* bpf_fentry_test2 */ + cookies[7] = 1; /* bpf_fentry_test8 */ opts.retprobe = true; diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 586dc52d6fb9..5b93d5d0bd93 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -364,6 +364,9 @@ static int get_syms(char ***symsp, size_t *cntp) continue; if (!strncmp(name, "rcu_", 4)) continue; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + continue; err = hashmap__add(map, name, NULL); if (err) { free(name); diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index c4da87ec3ba4..19c70880cfb3 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -831,6 +831,59 @@ out: bpf_object__close(obj); } +#include "tailcall_bpf2bpf6.skel.h" + +/* Tail call counting works even when there is data on stack which is + * not aligned to 8 bytes. + */ +static void test_tailcall_bpf2bpf_6(void) +{ + struct tailcall_bpf2bpf6 *obj; + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + obj = tailcall_bpf2bpf6__open_and_load(); + if (!ASSERT_OK_PTR(obj, "open and load")) + return; + + main_fd = bpf_program__fd(obj->progs.entry); + if (!ASSERT_GE(main_fd, 0, "entry prog fd")) + goto out; + + map_fd = bpf_map__fd(obj->maps.jmp_table); + if (!ASSERT_GE(map_fd, 0, "jmp_table map fd")) + goto out; + + prog_fd = bpf_program__fd(obj->progs.classifier_0); + if (!ASSERT_GE(prog_fd, 0, "classifier_0 prog fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "jmp_table map update")) + goto out; + + err = bpf_prog_test_run_opts(main_fd, &topts); + ASSERT_OK(err, "entry prog test run"); + ASSERT_EQ(topts.retval, 0, "tailcall retval"); + + data_fd = bpf_map__fd(obj->maps.bss); + if (!ASSERT_GE(map_fd, 0, "bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + ASSERT_OK(err, "bss map lookup"); + ASSERT_EQ(val, 1, "done flag is set"); + +out: + tailcall_bpf2bpf6__destroy(obj); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -855,4 +908,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_4(false); if (test__start_subtest("tailcall_bpf2bpf_5")) test_tailcall_bpf2bpf_4(true); + if (test__start_subtest("tailcall_bpf2bpf_6")) + test_tailcall_bpf2bpf_6(); } diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi.c b/tools/testing/selftests/bpf/progs/kprobe_multi.c index 93510f4f0f3a..08f95a8155d1 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi.c @@ -54,21 +54,21 @@ static void kprobe_multi_check(void *ctx, bool is_return) if (is_return) { SET(kretprobe_test1_result, &bpf_fentry_test1, 8); - SET(kretprobe_test2_result, &bpf_fentry_test2, 7); - SET(kretprobe_test3_result, &bpf_fentry_test3, 6); - SET(kretprobe_test4_result, &bpf_fentry_test4, 5); - SET(kretprobe_test5_result, &bpf_fentry_test5, 4); - SET(kretprobe_test6_result, &bpf_fentry_test6, 3); - SET(kretprobe_test7_result, &bpf_fentry_test7, 2); + SET(kretprobe_test2_result, &bpf_fentry_test2, 2); + SET(kretprobe_test3_result, &bpf_fentry_test3, 7); + SET(kretprobe_test4_result, &bpf_fentry_test4, 6); + SET(kretprobe_test5_result, &bpf_fentry_test5, 5); + SET(kretprobe_test6_result, &bpf_fentry_test6, 4); + SET(kretprobe_test7_result, &bpf_fentry_test7, 3); SET(kretprobe_test8_result, &bpf_fentry_test8, 1); } else { SET(kprobe_test1_result, &bpf_fentry_test1, 1); - SET(kprobe_test2_result, &bpf_fentry_test2, 2); - SET(kprobe_test3_result, &bpf_fentry_test3, 3); - SET(kprobe_test4_result, &bpf_fentry_test4, 4); - SET(kprobe_test5_result, &bpf_fentry_test5, 5); - SET(kprobe_test6_result, &bpf_fentry_test6, 6); - SET(kprobe_test7_result, &bpf_fentry_test7, 7); + SET(kprobe_test2_result, &bpf_fentry_test2, 7); + SET(kprobe_test3_result, &bpf_fentry_test3, 2); + SET(kprobe_test4_result, &bpf_fentry_test4, 3); + SET(kprobe_test5_result, &bpf_fentry_test5, 4); + SET(kprobe_test6_result, &bpf_fentry_test6, 5); + SET(kprobe_test7_result, &bpf_fentry_test7, 6); SET(kprobe_test8_result, &bpf_fentry_test8, 8); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c new file mode 100644 index 000000000000..41ce83da78e8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#define __unused __attribute__((unused)) + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int done = 0; + +SEC("tc") +int classifier_0(struct __sk_buff *skb __unused) +{ + done = 1; + return 0; +} + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + /* Don't propagate the constant to the caller */ + volatile int ret = 1; + + bpf_tail_call_static(skb, &jmp_table, 0); + return ret; +} + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + /* Have data on stack which size is not a multiple of 8 */ + volatile char arr[1] = {}; + + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/dma/Makefile b/tools/testing/selftests/dma/Makefile index aa8e8b5b3864..cd8c5ece1cba 100644 --- a/tools/testing/selftests/dma/Makefile +++ b/tools/testing/selftests/dma/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../include/ TEST_GEN_PROGS := dma_map_benchmark diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index c3b3c09e995e..5c997f17fcbd 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -10,8 +10,8 @@ #include <unistd.h> #include <sys/ioctl.h> #include <sys/mman.h> -#include <linux/map_benchmark.h> #include <linux/types.h> +#include <linux/map_benchmark.h> #define NSEC_PER_MSEC 1000000L diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 2a2d240cdc1b..1a5cc3cd97ec 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -7,10 +7,31 @@ else ifneq ($(filter -%,$(LLVM)),) LLVM_SUFFIX := $(LLVM) endif -CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as else CC := $(CROSS_COMPILE)gcc -endif +endif # LLVM ifeq (0,$(MAKELEVEL)) ifeq ($(OUTPUT),) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 54701c8b0cd7..03b586760164 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -70,6 +70,10 @@ NSB_LO_IP6=2001:db8:2::2 NL_IP=172.17.1.1 NL_IP6=2001:db8:4::1 +# multicast and broadcast addresses +MCAST_IP=224.0.0.1 +BCAST_IP=255.255.255.255 + MD5_PW=abc123 MD5_WRONG_PW=abc1234 @@ -308,6 +312,9 @@ addr2str() 127.0.0.1) echo "loopback";; ::1) echo "IPv6 loopback";; + ${BCAST_IP}) echo "broadcast";; + ${MCAST_IP}) echo "multicast";; + ${NSA_IP}) echo "ns-A IP";; ${NSA_IP6}) echo "ns-A IPv6";; ${NSA_LO_IP}) echo "ns-A loopback IP";; @@ -1793,12 +1800,33 @@ ipv4_addr_bind_novrf() done # - # raw socket with nonlocal bind + # tests for nonlocal bind # a=${NL_IP} log_start - run_cmd nettest -s -R -P icmp -f -l ${a} -I ${NSA_DEV} -b - log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after device bind" + run_cmd nettest -s -R -f -l ${a} -b + log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address" + + log_start + run_cmd nettest -s -f -l ${a} -b + log_test_addr ${a} $? 0 "TCP socket bind to nonlocal address" + + log_start + run_cmd nettest -s -D -P icmp -f -l ${a} -b + log_test_addr ${a} $? 0 "ICMP socket bind to nonlocal address" + + # + # check that ICMP sockets cannot bind to broadcast and multicast addresses + # + a=${BCAST_IP} + log_start + run_cmd nettest -s -D -P icmp -l ${a} -b + log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address" + + a=${MCAST_IP} + log_start + run_cmd nettest -s -D -P icmp -l ${a} -b + log_test_addr ${a} $? 1 "ICMP socket bind to multicast address" # # tcp sockets @@ -1850,13 +1878,34 @@ ipv4_addr_bind_vrf() log_test_addr ${a} $? 1 "Raw socket bind to out of scope address after VRF bind" # - # raw socket with nonlocal bind + # tests for nonlocal bind # a=${NL_IP} log_start - run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b + run_cmd nettest -s -R -f -l ${a} -I ${VRF} -b log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after VRF bind" + log_start + run_cmd nettest -s -f -l ${a} -I ${VRF} -b + log_test_addr ${a} $? 0 "TCP socket bind to nonlocal address after VRF bind" + + log_start + run_cmd nettest -s -D -P icmp -f -l ${a} -I ${VRF} -b + log_test_addr ${a} $? 0 "ICMP socket bind to nonlocal address after VRF bind" + + # + # check that ICMP sockets cannot bind to broadcast and multicast addresses + # + a=${BCAST_IP} + log_start + run_cmd nettest -s -D -P icmp -l ${a} -I ${VRF} -b + log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address after VRF bind" + + a=${MCAST_IP} + log_start + run_cmd nettest -s -D -P icmp -l ${a} -I ${VRF} -b + log_test_addr ${a} $? 1 "ICMP socket bind to multicast address after VRF bind" + # # tcp sockets # @@ -1889,10 +1938,12 @@ ipv4_addr_bind() log_subsection "No VRF" setup + set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null ipv4_addr_bind_novrf log_subsection "With VRF" setup "yes" + set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null ipv4_addr_bind_vrf } diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh index b35010cc7f6a..a6991877e50c 100755 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/netfilter/nft_concat_range.sh @@ -31,7 +31,7 @@ BUGS="flush_remove_add reload" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" - ../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh pktgen/pktgen_bench_xmit_mode_netif_receive.sh" # Definition of set types: diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 6bb36ca71cb5..a309876d832f 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -209,7 +209,7 @@ int main(int argc, char **argv) if (write) gup.gup_flags |= FOLL_WRITE; - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + gup_fd = open(GUP_TEST_FILE, O_RDWR); if (gup_fd == -1) { switch (errno) { case EACCES: @@ -224,7 +224,7 @@ int main(int argc, char **argv) printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); break; default: - perror("failed to open /sys/kernel/debug/gup_test"); + perror("failed to open " GUP_TEST_FILE); break; } exit(KSFT_SKIP); diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 2fcf24312da8..f5e4e0bbd081 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -54,6 +54,7 @@ static int ksm_write_sysfs(const char *file_path, unsigned long val) } if (fprintf(f, "%lu", val) < 0) { perror("fprintf"); + fclose(f); return 1; } fclose(f); @@ -72,6 +73,7 @@ static int ksm_read_sysfs(const char *file_path, unsigned long *val) } if (fscanf(f, "%lu", val) != 1) { perror("fscanf"); + fclose(f); return 1; } fclose(f); |
