| author    | Linus Torvalds <torvalds@linux-foundation.org>   | 2025-03-26 18:08:55 -0700 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Linus Torvalds <torvalds@linux-foundation.org>   | 2025-03-26 18:08:55 -0700 |
| commit    | 9b960d8cd6f712cb2c03e2bdd4d5ca058238037f (patch)  | |
| tree      | a1381af6c79626c0a28679f804477b43b7c91565          | |
| parent    | 91928e0d3cc29789f4483bffee5f36218f23942b (diff)   | |
| parent    | 3c9f0c9326b625bf008962d58996f89a3bba1e12 (diff)   | |
Merge tag 'for-6.15/block-20250322' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- Fixes for integrity handling
- NVMe pull request via Keith:
- Secure concatenation for TCP transport (Hannes)
- Multipath sysfs visibility (Nilay)
- Various cleanups (Qasim, Baruch, Wang, Chen, Mike, Damien, Li)
- Correct use of 64-bit BARs for pci-epf target (Niklas)
- Socket fix for selinux when used in containers (Peijie)
- MD pull request via Yu:
- fix recovery can preempt resync (Li Nan)
- fix md-bitmap IO limit (Su Yue)
- fix raid10 discard with REQ_NOWAIT (Xiao Ni)
- fix raid1 memory leak (Zheng Qixing)
- fix mddev uaf (Yu Kuai)
- fix raid1,raid10 IO flags (Yu Kuai)
- some refactor and cleanup (Yu Kuai)
- Series cleaning up and fixing bugs in the bad block handling code
- Improve support for write failure simulation in null_blk
- Various lock ordering fixes
- Fixes for locking for debugfs attributes
- Various ublk related fixes and improvements
- Cleanups for blk-rq-qos wait handling
- blk-throttle fixes
- Fixes for loop dio and sync handling
- Fixes and cleanups for the auto-PI code
- Block side support for hardware encryption keys in blk-crypto
- Various cleanups and fixes
* tag 'for-6.15/block-20250322' of git://git.kernel.dk/linux: (105 commits)
nvmet: replace max(a, min(b, c)) by clamp(val, lo, hi)
nvme-tcp: fix selinux denied when calling sock_sendmsg
nvmet: pci-epf: Always configure BAR0 as 64-bit
nvmet: Remove duplicate uuid_copy
nvme: zns: Simplify nvme_zone_parse_entry()
nvmet: pci-epf: Remove redundant 'flush_workqueue()' calls
nvmet-fc: Remove unused functions
nvme-pci: remove stale comment
nvme-fc: Utilise min3() to simplify queue count calculation
nvme-multipath: Add visibility for queue-depth io-policy
nvme-multipath: Add visibility for numa io-policy
nvme-multipath: Add visibility for round-robin io-policy
nvmet: add tls_concat and tls_key debugfs entries
nvmet-tcp: support secure channel concatenation
nvmet: Add 'sq' argument to alloc_ctrl_args
nvme-fabrics: reset admin connection for secure concatenation
nvme-tcp: request secure channel concatenation
nvme-keyring: add nvme_tls_psk_refresh()
nvme: add nvme_auth_derive_tls_psk()
nvme: add nvme_auth_generate_digest()
...
128 files changed, 4059 insertions, 1561 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 0cceb2badc83..3879963f0f01 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -109,6 +109,10 @@ Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: Indicates whether a storage device is capable of storing integrity metadata. Set if the device is T10 PI-capable. + This flag is set to 1 if the storage media is formatted + with T10 Protection Information. If the storage media is + not formatted with T10 Protection Information, this flag + is set to 0. What: /sys/block/<disk>/integrity/format @@ -117,6 +121,13 @@ Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: Metadata format for integrity capable block device. E.g. T10-DIF-TYPE1-CRC. + This field describes the type of T10 Protection Information + that the block device can send and receive. + If the device can store application integrity metadata but + no T10 Protection Information profile is used, this field + contains "nop". + If the device does not support integrity metadata, this + field contains "none". What: /sys/block/<disk>/integrity/protection_interval_bytes @@ -142,7 +153,17 @@ Date: June 2008 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: Number of bytes of integrity tag space available per - 512 bytes of data. + protection_interval_bytes, which is typically + the device's logical block size. + This field describes the size of the application tag + if the storage device is formatted with T10 Protection + Information and permits use of the application tag. + The tag_size is reported in bytes and indicates the + space available for adding an opaque tag to each block + (protection_interval_bytes). + If the device does not support T10 Protection Information + (even if the device provides application integrity + metadata space), this field is set to 0. What: /sys/block/<disk>/integrity/write_generate @@ -229,6 +250,17 @@ Description: encryption, refer to Documentation/block/inline-encryption.rst. +What: /sys/block/<disk>/queue/crypto/hw_wrapped_keys +Date: February 2025 +Contact: linux-block@vger.kernel.org +Description: + [RO] The presence of this file indicates that the device + supports hardware-wrapped inline encryption keys, i.e. key blobs + that can only be unwrapped and used by dedicated hardware. For + more information about hardware-wrapped inline encryption keys, + see Documentation/block/inline-encryption.rst. + + What: /sys/block/<disk>/queue/crypto/max_dun_bits Date: February 2022 Contact: linux-block@vger.kernel.org @@ -267,6 +299,15 @@ Description: use with inline encryption. +What: /sys/block/<disk>/queue/crypto/raw_keys +Date: February 2025 +Contact: linux-block@vger.kernel.org +Description: + [RO] The presence of this file indicates that the device + supports raw inline encryption keys, i.e. keys that are managed + in raw, plaintext form in software. + + What: /sys/block/<disk>/queue/dax Date: June 2016 Contact: linux-block@vger.kernel.org diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 90b733422ed4..6380e6ab492b 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -77,10 +77,10 @@ Basic design ============ We introduce ``struct blk_crypto_key`` to represent an inline encryption key and -how it will be used. 
This includes the actual bytes of the key; the size of the -key; the algorithm and data unit size the key will be used with; and the number -of bytes needed to represent the maximum data unit number the key will be used -with. +how it will be used. This includes the type of the key (raw or +hardware-wrapped); the actual bytes of the key; the size of the key; the +algorithm and data unit size the key will be used with; and the number of bytes +needed to represent the maximum data unit number the key will be used with. We introduce ``struct bio_crypt_ctx`` to represent an encryption context. It contains a data unit number and a pointer to a blk_crypto_key. We add pointers @@ -301,3 +301,250 @@ kernel will pretend that the device does not support hardware inline encryption When the crypto API fallback is enabled, this means that all bios with and encryption context will use the fallback, and IO will complete as usual. When the fallback is disabled, a bio with an encryption context will be failed. + +.. _hardware_wrapped_keys: + +Hardware-wrapped keys +===================== + +Motivation and threat model +--------------------------- + +Linux storage encryption (dm-crypt, fscrypt, eCryptfs, etc.) traditionally +relies on the raw encryption key(s) being present in kernel memory so that the +encryption can be performed. This traditionally isn't seen as a problem because +the key(s) won't be present during an offline attack, which is the main type of +attack that storage encryption is intended to protect from. + +However, there is an increasing desire to also protect users' data from other +types of attacks (to the extent possible), including: + +- Cold boot attacks, where an attacker with physical access to a system suddenly + powers it off, then immediately dumps the system memory to extract recently + in-use encryption keys, then uses these keys to decrypt user data on-disk. + +- Online attacks where the attacker is able to read kernel memory without fully + compromising the system, followed by an offline attack where any extracted + keys can be used to decrypt user data on-disk. An example of such an online + attack would be if the attacker is able to run some code on the system that + exploits a Meltdown-like vulnerability but is unable to escalate privileges. + +- Online attacks where the attacker fully compromises the system, but their data + exfiltration is significantly time-limited and/or bandwidth-limited, so in + order to completely exfiltrate the data they need to extract the encryption + keys to use in a later offline attack. + +Hardware-wrapped keys are a feature of inline encryption hardware that is +designed to protect users' data from the above attacks (to the extent possible), +without introducing limitations such as a maximum number of keys. + +Note that it is impossible to **fully** protect users' data from these attacks. +Even in the attacks where the attacker "just" gets read access to kernel memory, +they can still extract any user data that is present in memory, including +plaintext pagecache pages of encrypted files. The focus here is just on +protecting the encryption keys, as those instantly give access to **all** user +data in any following offline attack, rather than just some of it (where which +data is included in that "some" might not be controlled by the attacker). 
+ +Solution overview +----------------- + +Inline encryption hardware typically has "keyslots" into which software can +program keys for the hardware to use; the contents of keyslots typically can't +be read back by software. As such, the above security goals could be achieved +if the kernel simply erased its copy of the key(s) after programming them into +keyslot(s) and thereafter only referred to them via keyslot number. + +However, that naive approach runs into a couple problems: + +- It limits the number of unlocked keys to the number of keyslots, which + typically is a small number. In cases where there is only one encryption key + system-wide (e.g., a full-disk encryption key), that can be tolerable. + However, in general there can be many logged-in users with many different + keys, and/or many running applications with application-specific encrypted + storage areas. This is especially true if file-based encryption (e.g. + fscrypt) is being used. + +- Inline crypto engines typically lose the contents of their keyslots if the + storage controller (usually UFS or eMMC) is reset. Resetting the storage + controller is a standard error recovery procedure that is executed if certain + types of storage errors occur, and such errors can occur at any time. + Therefore, when inline crypto is being used, the operating system must always + be ready to reprogram the keyslots without user intervention. + +Thus, it is important for the kernel to still have a way to "remind" the +hardware about a key, without actually having the raw key itself. + +Somewhat less importantly, it is also desirable that the raw keys are never +visible to software at all, even while being initially unlocked. This would +ensure that a read-only compromise of system memory will never allow a key to be +extracted to be used off-system, even if it occurs when a key is being unlocked. + +To solve all these problems, some vendors of inline encryption hardware have +made their hardware support *hardware-wrapped keys*. Hardware-wrapped keys +are encrypted keys that can only be unwrapped (decrypted) and used by hardware +-- either by the inline encryption hardware itself, or by a dedicated hardware +block that can directly provision keys to the inline encryption hardware. + +(We refer to them as "hardware-wrapped keys" rather than simply "wrapped keys" +to add some clarity in cases where there could be other types of wrapped keys, +such as in file-based encryption. Key wrapping is a commonly used technique.) + +The key which wraps (encrypts) hardware-wrapped keys is a hardware-internal key +that is never exposed to software; it is either a persistent key (a "long-term +wrapping key") or a per-boot key (an "ephemeral wrapping key"). The long-term +wrapped form of the key is what is initially unlocked, but it is erased from +memory as soon as it is converted into an ephemerally-wrapped key. In-use +hardware-wrapped keys are always ephemerally-wrapped, not long-term wrapped. + +As inline encryption hardware can only be used to encrypt/decrypt data on-disk, +the hardware also includes a level of indirection; it doesn't use the unwrapped +key directly for inline encryption, but rather derives both an inline encryption +key and a "software secret" from it. Software can use the "software secret" for +tasks that can't use the inline encryption hardware, such as filenames +encryption. The software secret is not protected from memory compromise. 
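Whether a given block device supports raw keys, hardware-wrapped keys, or both is exported to userspace through the sysfs attributes added earlier in this patch (/sys/block/<disk>/queue/crypto/raw_keys and .../hw_wrapped_keys, whose mere presence indicates support). A minimal probe might look like the sketch below; the disk name "sda" is just a placeholder.

```c
/*
 * Probe the sysfs attributes added by this series.  Each attribute file
 * exists only if the corresponding key type is supported by the device.
 */
#include <stdio.h>
#include <unistd.h>

static int supports(const char *disk, const char *attr)
{
	char path[256];

	snprintf(path, sizeof(path), "/sys/block/%s/queue/crypto/%s",
		 disk, attr);
	return access(path, F_OK) == 0;
}

int main(void)
{
	printf("raw keys:              %s\n",
	       supports("sda", "raw_keys") ? "supported" : "not supported");
	printf("hardware-wrapped keys: %s\n",
	       supports("sda", "hw_wrapped_keys") ? "supported" : "not supported");
	return 0;
}
```

If the device has no inline encryption support at all, the crypto directory is absent and both probes simply report "not supported".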
+
+Key hierarchy
+-------------
+
+Here is the key hierarchy for a hardware-wrapped key::
+
+                    Hardware-wrapped key
+                             |
+                             |
+                      <Hardware KDF>
+                             |
+              -----------------------------
+              |                           |
+   Inline encryption key          Software secret
+
+The components are:
+
+- *Hardware-wrapped key*: a key for the hardware's KDF (Key Derivation
+  Function), in ephemerally-wrapped form. The key wrapping algorithm is a
+  hardware implementation detail that doesn't impact kernel operation, but a
+  strong authenticated encryption algorithm such as AES-256-GCM is recommended.
+
+- *Hardware KDF*: a KDF (Key Derivation Function) which the hardware uses to
+  derive subkeys after unwrapping the wrapped key. The hardware's choice of
+  KDF doesn't impact kernel operation, but it does need to be known for
+  testing purposes, and it's also assumed to have at least a 256-bit security
+  strength. All known hardware uses the SP800-108 KDF in Counter Mode with
+  AES-256-CMAC, with a particular choice of labels and contexts; new hardware
+  should use this already-vetted KDF.
+
+- *Inline encryption key*: a derived key which the hardware directly
+  provisions to a keyslot of the inline encryption hardware, without exposing
+  it to software. In all known hardware, this will always be an AES-256-XTS
+  key. However, in principle other encryption algorithms could be supported
+  too. Hardware must derive distinct subkeys for each supported encryption
+  algorithm.
+
+- *Software secret*: a derived key which the hardware returns to software so
+  that software can use it for cryptographic tasks that can't use inline
+  encryption. This value is cryptographically isolated from the inline
+  encryption key, i.e. knowing one doesn't reveal the other. (The KDF ensures
+  this.) Currently, the software secret is always 32 bytes and thus is
+  suitable for cryptographic applications that require up to a 256-bit
+  security strength. Some use cases (e.g. full-disk encryption) won't require
+  the software secret.
+
+Example: in the case of fscrypt, the fscrypt master key (the key that protects
+a particular set of encrypted directories) is made hardware-wrapped. The
+inline encryption key is used as the file contents encryption key, while the
+software secret (rather than the master key directly) is used to key fscrypt's
+KDF (HKDF-SHA512) to derive other subkeys such as filenames encryption keys.
+
+Note that currently this design assumes a single inline encryption key per
+hardware-wrapped key, without any further key derivation. Thus, in the case of
+fscrypt, currently hardware-wrapped keys are only compatible with the "inline
+encryption optimized" settings, which use one file contents encryption key per
+encryption policy rather than one per file. This design could be extended to
+make the hardware derive per-file keys using per-file nonces passed down the
+storage stack, and in fact some hardware already supports this; future work is
+planned to remove this limitation by adding the corresponding kernel support.
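The hardware KDF itself is never run by the kernel, but because the text above pins it down as SP800-108 in Counter Mode with AES-256-CMAC, it can be reproduced in userspace for testing (see the Testability section below). The sketch that follows is only a schematic of that construction, written against OpenSSL's legacy CMAC API; the label and context values that real hardware feeds into the KDF are a vendor implementation detail not given here, and the 32-byte KDF key size is an assumption.

```c
/*
 * Schematic only: SP800-108 Counter Mode KDF with AES-256-CMAC as the PRF.
 * The label/context inputs are hardware-specific and NOT specified by the
 * documentation above; callers must supply the vendor-defined values.
 * Uses OpenSSL's legacy CMAC API (deprecated in OpenSSL 3, but compact).
 * Build sketch: gcc kdf_sketch.c -lcrypto
 */
#include <openssl/cmac.h>
#include <openssl/evp.h>
#include <stdint.h>
#include <string.h>

/* One PRF invocation: AES-256-CMAC(key, msg) -> 16-byte block */
static int aes256_cmac(const uint8_t key[32], const uint8_t *msg, size_t len,
		       uint8_t out[16])
{
	CMAC_CTX *ctx = CMAC_CTX_new();
	size_t outlen = 0;
	int ok = 0;

	if (ctx) {
		ok = CMAC_Init(ctx, key, 32, EVP_aes_256_cbc(), NULL) &&
		     CMAC_Update(ctx, msg, len) &&
		     CMAC_Final(ctx, out, &outlen);
		CMAC_CTX_free(ctx);
	}
	return (ok && outlen == 16) ? 0 : -1;
}

/* out = KDF(unwrapped_key, label, context), SP800-108 counter mode */
static int hw_kdf_sketch(const uint8_t unwrapped_key[32],
			 const uint8_t *label, size_t label_len,
			 const uint8_t *context, size_t context_len,
			 uint8_t *out, size_t out_len)
{
	uint8_t msg[128], prf_out[16];
	uint32_t counter, L_bits = out_len * 8;
	size_t pos = 0, off;

	if (9 + label_len + context_len > sizeof(msg))
		return -1;

	for (counter = 1; pos < out_len; counter++) {
		off = 0;
		/* [i]_32 || Label || 0x00 || Context || [L]_32, big-endian */
		msg[off++] = counter >> 24; msg[off++] = counter >> 16;
		msg[off++] = counter >> 8;  msg[off++] = counter;
		memcpy(msg + off, label, label_len);     off += label_len;
		msg[off++] = 0x00;
		memcpy(msg + off, context, context_len); off += context_len;
		msg[off++] = L_bits >> 24; msg[off++] = L_bits >> 16;
		msg[off++] = L_bits >> 8;  msg[off++] = L_bits;

		if (aes256_cmac(unwrapped_key, msg, off, prf_out))
			return -1;
		memcpy(out + pos, prf_out,
		       out_len - pos < 16 ? out_len - pos : 16);
		pos += 16;
	}
	return 0;
}
```

With this building block, deriving the 64-byte AES-256-XTS inline encryption key and the 32-byte software secret would be two separate invocations with different (vendor-defined) labels, for example hw_kdf_sketch(key, label_enc, ..., xts_key, 64) and hw_kdf_sketch(key, label_sw, ..., sw_secret, 32).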
+
+Kernel support
+--------------
+
+The inline encryption support of the kernel's block layer ("blk-crypto") has
+been extended to support hardware-wrapped keys as an alternative to raw keys,
+when hardware support is available. This works in the following way:
+
+- A ``key_types_supported`` field is added to the crypto capabilities in
+  ``struct blk_crypto_profile``. This allows device drivers to declare that
+  they support raw keys, hardware-wrapped keys, or both.
+
+- ``struct blk_crypto_key`` can now contain a hardware-wrapped key as an
+  alternative to a raw key; a ``key_type`` field is added to
+  ``struct blk_crypto_config`` to distinguish between the different key types.
+  This allows users of blk-crypto to en/decrypt data using a hardware-wrapped
+  key in a way very similar to using a raw key.
+
+- A new method ``blk_crypto_ll_ops::derive_sw_secret`` is added. Device
+  drivers that support hardware-wrapped keys must implement this method.
+  Users of blk-crypto can call ``blk_crypto_derive_sw_secret()`` to access
+  this method.
+
+- The programming and eviction of hardware-wrapped keys happens via
+  ``blk_crypto_ll_ops::keyslot_program`` and
+  ``blk_crypto_ll_ops::keyslot_evict``, just like it does for raw keys. If a
+  driver supports hardware-wrapped keys, then it must handle hardware-wrapped
+  keys being passed to these methods.
+
+blk-crypto-fallback doesn't support hardware-wrapped keys. Therefore,
+hardware-wrapped keys can only be used with actual inline encryption hardware.
+
+All the above deals with hardware-wrapped keys in ephemerally-wrapped form
+only. To get such keys in the first place, new block device ioctls have been
+added to provide a generic interface to creating and preparing such keys:
+
+- ``BLKCRYPTOIMPORTKEY`` converts a raw key to long-term wrapped form. It
+  takes in a pointer to a ``struct blk_crypto_import_key_arg``. The caller
+  must set ``raw_key_ptr`` and ``raw_key_size`` to the pointer and size (in
+  bytes) of the raw key to import. On success, ``BLKCRYPTOIMPORTKEY`` returns
+  0 and writes the resulting long-term wrapped key blob to the buffer pointed
+  to by ``lt_key_ptr``, which is of maximum size ``lt_key_size``. It also
+  updates ``lt_key_size`` to be the actual size of the key. On failure, it
+  returns -1 and sets errno. An errno of ``EOPNOTSUPP`` indicates that the
+  block device does not support hardware-wrapped keys. An errno of
+  ``EOVERFLOW`` indicates that the output buffer did not have enough space
+  for the key blob.
+
+- ``BLKCRYPTOGENERATEKEY`` is like ``BLKCRYPTOIMPORTKEY``, but it has the
+  hardware generate the key instead of importing one. It takes in a pointer
+  to a ``struct blk_crypto_generate_key_arg``.
+
+- ``BLKCRYPTOPREPAREKEY`` converts a key from long-term wrapped form to
+  ephemerally-wrapped form. It takes in a pointer to a ``struct
+  blk_crypto_prepare_key_arg``. The caller must set ``lt_key_ptr`` and
+  ``lt_key_size`` to the pointer and size (in bytes) of the long-term wrapped
+  key blob to convert. On success, ``BLKCRYPTOPREPAREKEY`` returns 0 and
+  writes the resulting ephemerally-wrapped key blob to the buffer pointed to
+  by ``eph_key_ptr``, which is of maximum size ``eph_key_size``. It also
+  updates ``eph_key_size`` to be the actual size of the key. On failure, it
+  returns -1 and sets errno. Errno values of ``EOPNOTSUPP`` and ``EOVERFLOW``
+  mean the same as they do for ``BLKCRYPTOIMPORTKEY``. An errno of
+  ``EBADMSG`` indicates that the long-term wrapped key is invalid.
+
+Userspace needs to use either ``BLKCRYPTOIMPORTKEY`` or ``BLKCRYPTOGENERATEKEY``
+once to create a key, and then ``BLKCRYPTOPREPAREKEY`` each time the key is
+unlocked and added to the kernel. Note that these ioctls have no relevance for
+raw keys; they are only for hardware-wrapped keys.
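To make that flow concrete, here is a hedged userspace sketch of the import-then-prepare sequence. It relies only on the ioctl names and struct field names described above; the exact field types, any reserved fields, and the real maximum wrapped-key size must be taken from the UAPI header linux/blk-crypto.h added by this series, so the buffer bound below is purely illustrative.

```c
/*
 * Sketch of the hardware-wrapped key lifecycle from userspace:
 *   1) BLKCRYPTOIMPORTKEY once, to turn a raw key into a long-term
 *      wrapped blob that can be stored persistently.
 *   2) BLKCRYPTOPREPAREKEY on each unlock, to get the ephemerally-wrapped
 *      form that is actually used for I/O.
 * Field names follow the description above; consult <linux/blk-crypto.h>
 * for the authoritative definitions.  Error handling is abbreviated.
 */
#include <linux/blk-crypto.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WRAPPED_KEY_BUF_SIZE 128	/* illustrative bound, not from the UAPI */

static int import_and_prepare(int bdev_fd,
			      const uint8_t *raw_key, size_t raw_key_size,
			      uint8_t *eph_key, uint64_t *eph_key_size)
{
	uint8_t lt_key[WRAPPED_KEY_BUF_SIZE];
	struct blk_crypto_import_key_arg imp;
	struct blk_crypto_prepare_key_arg prep;

	/* One-time step: produce the long-term wrapped key blob. */
	memset(&imp, 0, sizeof(imp));
	imp.raw_key_ptr = (uintptr_t)raw_key;
	imp.raw_key_size = raw_key_size;
	imp.lt_key_ptr = (uintptr_t)lt_key;
	imp.lt_key_size = sizeof(lt_key);
	if (ioctl(bdev_fd, BLKCRYPTOIMPORTKEY, &imp) != 0) {
		perror("BLKCRYPTOIMPORTKEY");	/* EOPNOTSUPP: no HW support */
		return -1;
	}
	/* imp.lt_key_size is now the actual blob size; persist lt_key[]. */

	/* Per-unlock step: convert to ephemerally-wrapped form. */
	memset(&prep, 0, sizeof(prep));
	prep.lt_key_ptr = (uintptr_t)lt_key;
	prep.lt_key_size = imp.lt_key_size;
	prep.eph_key_ptr = (uintptr_t)eph_key;
	prep.eph_key_size = *eph_key_size;
	if (ioctl(bdev_fd, BLKCRYPTOPREPAREKEY, &prep) != 0) {
		perror("BLKCRYPTOPREPAREKEY");	/* EBADMSG: invalid blob */
		return -1;
	}
	*eph_key_size = prep.eph_key_size;
	return 0;
}
```

The ephemerally-wrapped blob produced by the second step is what gets handed to the kernel consumer of blk-crypto each time the key is unlocked, as described above.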
+
+Testability
+-----------
+
+Both the hardware KDF and the inline encryption itself are well-defined
+algorithms that don't depend on any secrets other than the unwrapped key.
+Therefore, if the unwrapped key is known to software, these algorithms can be
+reproduced in software in order to verify the ciphertext that is written to
+disk by the inline encryption hardware.
+
+However, the unwrapped key will only be known to software for testing if the
+"import" functionality is used. Proper testing is not possible in the
+"generate" case where the hardware generates the key itself. The correct
+operation of the "generate" mode thus relies on the security and correctness
+of the hardware RNG and its use to generate the key, as well as the testing of
+the "import" mode as that should cover all parts other than the key generation.
+
+For an example of a test that verifies the ciphertext written to disk in the
+"import" mode, see the fscrypt hardware-wrapped key tests in xfstests, or
+`Android's vts_kernel_encryption_test
+<https://android.googlesource.com/platform/test/vts-testcase/kernel/+/refs/heads/main/encryption/>`_.
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 66dcfaae698b..d420be65882d 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -85,6 +85,8 @@ Code  Seq#    Include File                                             Comments
 0x10  20-2F  arch/s390/include/uapi/asm/hypfs.h
 0x12  all    linux/fs.h                                                BLK* ioctls
              linux/blkpg.h
+             linux/blkzoned.h
+             linux/blk-crypto.h
 0x15  all    linux/fs.h                                                FS_IOC_* ioctls
 0x1b  all    InfiniBand Subsystem
              <http://infiniband.sourceforge.net/>
diff --git a/block/Makefile b/block/Makefile
index 33748123710b..3a941dc0d27f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -26,7 +26,8 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
 obj-$(CONFIG_IOSCHED_BFQ)	+= bfq.o
 
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \
+					bio-integrity-auto.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
diff --git a/block/badblocks.c b/block/badblocks.c
index db4ec8b9b2a8..ece64e76fe8f 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -528,51 +528,6 @@ out:
 }
 
 /*
- * Return 'true' if the range indicated by 'bad' can be backward merged
- * with the bad range (from the bad table) index by 'behind'.
- */
-static bool can_merge_behind(struct badblocks *bb,
-			     struct badblocks_context *bad, int behind)
-{
-	sector_t sectors = bad->len;
-	sector_t s = bad->start;
-	u64 *p = bb->page;
-
-	if ((s < BB_OFFSET(p[behind])) &&
-	    ((s + sectors) >= BB_OFFSET(p[behind])) &&
-	    ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
-	    BB_ACK(p[behind]) == bad->ack)
-		return true;
-	return false;
-}
-
-/*
- * Do backward merge for range indicated by 'bad' and the bad range
- * (from the bad table) indexed by 'behind'. The return value is merged
- * sectors from bad->len.
- */ -static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, - int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - int merged = 0; - - WARN_ON(s >= BB_OFFSET(p[behind])); - WARN_ON((s + sectors) < BB_OFFSET(p[behind])); - - if (s < BB_OFFSET(p[behind])) { - merged = BB_OFFSET(p[behind]) - s; - p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); - - WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); - } - - return merged; -} - -/* * Return 'true' if the range indicated by 'bad' can be forward * merged with the bad range (from the bad table) indexed by 'prev'. */ @@ -745,7 +700,7 @@ static bool can_front_overwrite(struct badblocks *bb, int prev, *extra = 2; } - if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + if ((bb->count + (*extra)) > MAX_BADBLOCKS) return false; return true; @@ -855,40 +810,60 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. + */ +static bool try_adjacent_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + if (prev >= 0 && (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + return true; + } + return false; +} + /* Do exact work to set bad block range into the bad block table */ -static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged) { - int retried = 0, space_desired = 0; - int orig_len, len = 0, added = 0; + int len = 0, added = 0; struct badblocks_context bad; int prev = -1, hint = -1; - sector_t orig_start; unsigned long flags; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { /* round the start down, and the end up */ sector_t next = s + sectors; - rounddown(s, bb->shift); - roundup(next, bb->shift); + rounddown(s, 1 << bb->shift); + roundup(next, 1 << bb->shift); sectors = next - s; } write_seqlock_irqsave(&bb->lock, flags); - orig_start = s; - orig_len = sectors; bad.ack = acknowledged; p = bb->page; @@ -897,6 +872,9 @@ re_insert: bad.len = sectors; len = 0; + if (badblocks_full(bb)) + goto out; + if (badblocks_empty(bb)) { len = insert_at(bb, 0, &bad); bb->count++; @@ -908,32 +886,14 @@ re_insert: /* start before all badblocks */ if (prev < 0) { - if (!badblocks_full(bb)) { - /* insert on the first */ - if (bad.len > (BB_OFFSET(p[0]) - bad.start)) - bad.len = BB_OFFSET(p[0]) - bad.start; - len = insert_at(bb, 0, &bad); - bb->count++; - added++; - hint = 0; - goto update_sectors; - } - - /* No sapce, try to merge */ - if (overlap_behind(bb, &bad, 0)) { - if (can_merge_behind(bb, &bad, 0)) { - len = behind_merge(bb, &bad, 0); - added++; - } else { - len = BB_OFFSET(p[0]) - s; - space_desired = 1; - } - hint = 0; - goto update_sectors; - } - - /* no table space and give up */ - goto out; + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = 
BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = ++prev; + goto update_sectors; } /* in case p[prev-1] can be merged with p[prev] */ @@ -945,33 +905,6 @@ re_insert: goto update_sectors; } - if (overlap_front(bb, prev, &bad)) { - if (can_merge_front(bb, prev, &bad)) { - len = front_merge(bb, prev, &bad); - added++; - } else { - int extra = 0; - - if (!can_front_overwrite(bb, prev, &bad, &extra)) { - len = min_t(sector_t, - BB_END(p[prev]) - s, sectors); - hint = prev; - goto update_sectors; - } - - len = front_overwrite(bb, prev, &bad, extra); - added++; - bb->count += extra; - - if (can_combine_front(bb, prev, &bad)) { - front_combine(bb, prev); - bb->count--; - } - } - hint = prev; - goto update_sectors; - } - if (can_merge_front(bb, prev, &bad)) { len = front_merge(bb, prev, &bad); added++; @@ -979,21 +912,29 @@ re_insert: goto update_sectors; } - /* if no space in table, still try to merge in the covered range */ - if (badblocks_full(bb)) { - /* skip the cannot-merge range */ - if (((prev + 1) < bb->count) && - overlap_behind(bb, &bad, prev + 1) && - ((s + sectors) >= BB_END(p[prev + 1]))) { - len = BB_END(p[prev + 1]) - s; - hint = prev + 1; + if (overlap_front(bb, prev, &bad)) { + int extra = 0; + + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + if (extra > 0) + goto out; + + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; goto update_sectors; } - /* no retry any more */ - len = sectors; - space_desired = 1; - hint = -1; + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; + + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; + } + + hint = prev; goto update_sectors; } @@ -1006,7 +947,7 @@ re_insert: len = insert_at(bb, prev + 1, &bad); bb->count++; added++; - hint = prev + 1; + hint = ++prev; update_sectors: s += len; @@ -1015,35 +956,12 @@ update_sectors: if (sectors > 0) goto re_insert; - WARN_ON(sectors < 0); - /* * Check whether the following already set range can be * merged. (prev < 0) condition is not handled here, * because it's already complicated enough. 
*/ - if (prev >= 0 && - (prev + 1) < bb->count && - BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && - (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && - BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { - p[prev] = BB_MAKE(BB_OFFSET(p[prev]), - BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), - BB_ACK(p[prev])); - - if ((prev + 2) < bb->count) - memmove(p + prev + 1, p + prev + 2, - (bb->count - (prev + 2)) * 8); - bb->count--; - } - - if (space_desired && !badblocks_full(bb)) { - s = orig_start; - sectors = orig_len; - space_desired = 0; - if (retried++ < 3) - goto re_insert; - } + try_adjacent_combine(bb, prev); out: if (added) { @@ -1057,10 +975,7 @@ out: write_sequnlock_irqrestore(&bb->lock, flags); - if (!added) - rv = 1; - - return rv; + return sectors == 0; } /* @@ -1131,21 +1046,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { struct badblocks_context bad; int prev = -1, hint = -1; int len = 0, cleared = 0; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { sector_t target; @@ -1157,8 +1071,8 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) * isn't than to think a block is not bad when it is. */ target = s + sectors; - roundup(s, bb->shift); - rounddown(target, bb->shift); + roundup(s, 1 << bb->shift); + rounddown(target, 1 << bb->shift); sectors = target - s; } @@ -1214,7 +1128,7 @@ re_clear: if ((BB_OFFSET(p[prev]) < bad.start) && (BB_END(p[prev]) > (bad.start + bad.len))) { /* Splitting */ - if ((bb->count + 1) < MAX_BADBLOCKS) { + if ((bb->count + 1) <= MAX_BADBLOCKS) { len = front_splitting_clear(bb, prev, &bad); bb->count += 1; cleared++; @@ -1255,8 +1169,6 @@ update_sectors: if (sectors > 0) goto re_clear; - WARN_ON(sectors < 0); - if (cleared) { badblocks_update_acked(bb); set_changed(bb); @@ -1265,40 +1177,21 @@ update_sectors: write_sequnlock_irq(&bb->lock); if (!cleared) - rv = 1; + return false; - return rv; + return true; } /* Do the exact work to check bad blocks range from the bad block table */ -static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { - int unacked_badblocks, acked_badblocks; int prev = -1, hint = -1, set = 0; struct badblocks_context bad; - unsigned int seq; + int unacked_badblocks = 0; + int acked_badblocks = 0; + u64 *p = bb->page; int len, rv; - u64 *p; - - WARN_ON(bb->shift < 0 || sectors == 0); - - if (bb->shift > 0) { - sector_t target; - - /* round the start down, and the end up */ - target = s + sectors; - rounddown(s, bb->shift); - roundup(target, bb->shift); - sectors = target - s; - } - -retry: - seq = read_seqbegin(&bb->lock); - - p = bb->page; - unacked_badblocks = 0; - acked_badblocks = 0; re_check: bad.start = s; @@ -1349,14 +1242,15 @@ re_check: len = sectors; update_sectors: + /* This situation should never happen */ + WARN_ON(sectors < len); + s += len; sectors -= len; if (sectors > 0) goto re_check; - WARN_ON(sectors < 0); - if (unacked_badblocks > 0) rv = -1; else if (acked_badblocks > 0) @@ -1364,9 +1258,6 @@ update_sectors: else rv = 0; 
- if (read_seqretry(&bb->lock, seq)) - goto retry; - return rv; } @@ -1404,10 +1295,30 @@ update_sectors: * -1: there are bad blocks which have not yet been acknowledged in metadata. * plus the start/length of the first bad section we overlap. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { - return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + unsigned int seq; + int rv; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + /* round the start down, and the end up */ + sector_t target = s + sectors; + + rounddown(s, 1 << bb->shift); + roundup(target, 1 << bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + rv = _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; } EXPORT_SYMBOL_GPL(badblocks_check); @@ -1423,11 +1334,12 @@ EXPORT_SYMBOL_GPL(badblocks_check); * decide how best to handle it. * * Return: - * 0: success - * 1: failed to set badblocks (out of space) + * true: success + * false: failed to set badblocks (out of space). Parital setting will be + * treated as failure. */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); } @@ -1444,10 +1356,10 @@ EXPORT_SYMBOL_GPL(badblocks_set); * drop the remove request. * * Return: - * 0: success - * 1: failed to clear badblocks + * true: success + * false: failed to clear badblocks */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { return _badblocks_clear(bb, s, sectors); } @@ -1479,6 +1391,11 @@ void ack_all_badblocks(struct badblocks *bb) p[i] = BB_MAKE(start, len, 1); } } + + for (i = 0; i < bb->count ; i++) + while (try_adjacent_combine(bb, i)) + ; + bb->unacked_exist = 0; } write_sequnlock_irq(&bb->lock); @@ -1564,10 +1481,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, return -EINVAL; } - if (badblocks_set(bb, sector, length, !unack)) + if (!badblocks_set(bb, sector, length, !unack)) return -ENOSPC; - else - return len; + + return len; } EXPORT_SYMBOL_GPL(badblocks_store); diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c new file mode 100644 index 000000000000..e524c609be50 --- /dev/null +++ b/block/bio-integrity-auto.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * Automatically generate and verify integrity data on PI capable devices if the + * bio submitter didn't provide PI itself. This ensures that kernel verifies + * data integrity even if the file system (or other user of the block device) is + * not aware of PI. 
+ */ +#include <linux/blk-integrity.h> +#include <linux/workqueue.h> +#include "blk.h" + +struct bio_integrity_data { + struct bio *bio; + struct bvec_iter saved_bio_iter; + struct work_struct work; + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *bid_slab; +static mempool_t bid_pool; +static struct workqueue_struct *kintegrityd_wq; + +static void bio_integrity_finish(struct bio_integrity_data *bid) +{ + bid->bio->bi_integrity = NULL; + bid->bio->bi_opf &= ~REQ_INTEGRITY; + kfree(bvec_virt(bid->bip.bip_vec)); + mempool_free(bid, &bid_pool); +} + +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_data *bid = + container_of(work, struct bio_integrity_data, work); + struct bio *bio = bid->bio; + + blk_integrity_verify_iter(bio, &bid->saved_bio_iter); + bio_integrity_finish(bid); + bio_endio(bio); +} + +/** + * __bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * + * Normally I/O completion is done in interrupt context. However, verifying I/O + * integrity is a time-consuming task which must be run in process context. + * + * This function postpones completion accordingly. + */ +bool __bio_integrity_endio(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_data *bid = + container_of(bip, struct bio_integrity_data, bip); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { + INIT_WORK(&bid->work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bid->work); + return false; + } + + bio_integrity_finish(bid); + return true; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Checks if the bio already has an integrity payload attached. If it does, the + * payload has been generated by another kernel subsystem, and we just pass it + * through. + * Otherwise allocates integrity payload and for writes the integrity metadata + * will be generated. For reads, the completion handler will verify the + * metadata. + */ +bool bio_integrity_prep(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_data *bid; + gfp_t gfp = GFP_NOIO; + unsigned int len; + void *buf; + + if (!bi) + return true; + + if (!bio_sectors(bio)) + return true; + + /* Already protected? */ + if (bio_integrity(bio)) + return true; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) + return true; + break; + case REQ_OP_WRITE: + if (bi->flags & BLK_INTEGRITY_NOGENERATE) + return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. 
+ */ + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; + break; + default: + return true; + } + + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return true; + + /* Allocate kernel buffer for protection data */ + len = bio_integrity_bytes(bi, bio_sectors(bio)); + buf = kmalloc(len, gfp); + if (!buf) + goto err_end_io; + bid = mempool_alloc(&bid_pool, GFP_NOIO); + if (!bid) + goto err_free_buf; + bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); + + bid->bio = bio; + + bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); + + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bid->bip.bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bid->bip.bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bid->bip.bip_flags |= BIP_CHECK_REFTAG; + + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) + goto err_end_io; + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(bio); + else + bid->saved_bio_iter = bio->bi_iter; + return true; + +err_free_buf: + kfree(buf); +err_end_io: + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return false; +} +EXPORT_SYMBOL(bio_integrity_prep); + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +static int __init blk_integrity_auto_init(void) +{ + bid_slab = kmem_cache_create("bio_integrity_data", + sizeof(struct bio_integrity_data), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) + panic("bio: can't create integrity pool\n"); + + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. 
+ */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + return 0; +} +subsys_initcall(blk_integrity_auto_init); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5d81ad9a3d20..608594a154a5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,20 +7,12 @@ */ #include <linux/blk-integrity.h> -#include <linux/mempool.h> -#include <linux/export.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/slab.h> #include "blk.h" -static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -void blk_flush_integrity(void) -{ - flush_workqueue(kintegrityd_wq); -} +struct bio_integrity_alloc { + struct bio_integrity_payload bip; + struct bio_vec bvecs[]; +}; /** * bio_integrity_free - Free bio integrity payload @@ -30,21 +22,23 @@ void blk_flush_integrity(void) */ void bio_integrity_free(struct bio *bio) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - if (bip->bip_vec) - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } + kfree(bio_integrity(bio)); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs) +{ + memset(bip, 0, sizeof(*bip)); + bip->bip_max_vcnt = nr_vecs; + if (nr_vecs) + bip->bip_vec = bvecs; + + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -59,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned inline_vecs; + struct bio_integrity_alloc *bia; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); - if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { - bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!bip)) + bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask); + if (unlikely(!bia)) return ERR_PTR(-ENOMEM); - - memset(bip, 0, sizeof(*bip)); - - /* always report as many vecs as asked explicitly, not inline vecs */ - bip->bip_max_vcnt = nr_vecs; - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); - if (!bip->bip_vec) - goto err; - } else if (nr_vecs) { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_bio = bio; - bio->bi_integrity = bip; - bio->bi_opf |= REQ_INTEGRITY; - - return bip; -err: - if (bs && mempool_initialized(&bs->bio_integrity_pool)) - mempool_free(bip, &bs->bio_integrity_pool); - else - kfree(bip); - return ERR_PTR(-ENOMEM); + bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs); + return &bia->bip; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -414,149 +376,6 @@ int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) } /** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Checks if the bio already has an integrity payload attached. 
- * If it does, the payload has been generated by another kernel subsystem, - * and we just pass it through. Otherwise allocates integrity payload. - * The bio must have data direction, target device and start sector set priot - * to calling. In the WRITE case, integrity metadata will be generated using - * the block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -bool bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - unsigned int len; - void *buf; - gfp_t gfp = GFP_NOIO; - - if (!bi) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? */ - if (bio_integrity(bio)) - return true; - - switch (bio_op(bio)) { - case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) - return true; - break; - case REQ_OP_WRITE: - if (bi->flags & BLK_INTEGRITY_NOGENERATE) - return true; - - /* - * Zero the memory allocated to not leak uninitialized kernel - * memory to disk for non-integrity metadata where nothing else - * initializes the memory. - */ - if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) - gfp |= __GFP_ZERO; - break; - default: - return true; - } - - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (unlikely(buf == NULL)) { - goto err_end_io; - } - - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } - - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); - - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; - - /* describe what tags to check in payload */ - if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - blk_integrity_generate(bio); - else - bip->bio_iter = bio->bi_iter; - return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - - blk_integrity_verify(bio); - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - bio_endio(bio); -} - -/** - * __bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. 
- */ -bool __bio_integrity_endio(struct bio *bio) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); - return false; - } - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - return true; -} - -/** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed @@ -617,44 +436,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (mempool_initialized(&bs->bio_integrity_pool)) - return 0; - - if (mempool_init_slab_pool(&bs->bio_integrity_pool, - pool_size, bip_slab)) - return -1; - - if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { - mempool_exit(&bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - mempool_exit(&bs->bio_integrity_pool); - mempool_exit(&bs->bvec_integrity_pool); -} - -void __init bio_integrity_init(void) -{ - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} diff --git a/block/bio.c b/block/bio.c index 6ac5983ba51e..4e6c85a33d74 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1026,9 +1026,10 @@ EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { + unsigned long nr = off / PAGE_SIZE; + WARN_ON_ONCE(len > UINT_MAX); - WARN_ON_ONCE(off > UINT_MAX); - __bio_add_page(bio, &folio->page, len, off); + __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); @@ -1049,9 +1050,11 @@ EXPORT_SYMBOL_GPL(bio_add_folio_nofail); bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { - if (len > UINT_MAX || off > UINT_MAX) + unsigned long nr = off / PAGE_SIZE; + + if (len > UINT_MAX) return false; - return bio_add_page(bio, &folio->page, len, off) > 0; + return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); @@ -1657,7 +1660,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1737,8 +1739,6 @@ static int __init init_bio(void) BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); - bio_integrity_init(); - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1754,9 +1754,6 @@ static int __init init_bio(void) BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 
1994a11ff903..5905f277057b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -816,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) ctx->bdev = bdev; return 0; } +/* + * Similar to blkg_conf_open_bdev, but additionally freezes the queue, + * acquires q->elevator_lock, and ensures the correct locking order + * between q->elevator_lock and q->rq_qos_mutex. + * + * This function returns negative error on failure. On success it returns + * memflags which must be saved and later passed to blkg_conf_exit_frozen + * for restoring the memalloc scope. + */ +unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) +{ + int ret; + unsigned long memflags; + + if (ctx->bdev) + return -EINVAL; + + ret = blkg_conf_open_bdev(ctx); + if (ret < 0) + return ret; + /* + * At this point, we haven’t started protecting anything related to QoS, + * so we release q->rq_qos_mutex here, which was first acquired in blkg_ + * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing + * the queue and acquiring q->elevator_lock to maintain the correct + * locking order. + */ + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + + memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); + mutex_lock(&ctx->bdev->bd_queue->elevator_lock); + mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); + + return memflags; +} /** * blkg_conf_prep - parse and prepare for per-blkg config update @@ -972,6 +1007,22 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_exit); +/* + * Similar to blkg_conf_exit, but also unfreezes the queue and releases + * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen + * is used to open the bdev. + */ +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) +{ + if (ctx->bdev) { + struct request_queue *q = ctx->bdev->bd_queue; + + blkg_conf_exit(ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + } +} + static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; @@ -1728,27 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. + */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ - ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + ret = -ENOSPC; goto err_unlock; } - /* - * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy - * without pd_alloc_fn/pd_free_fn can't be activated. 
- */ - if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) - goto err_unlock; - /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1759,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) + if (!cpd) { + ret = -ENOMEM; goto err_free_cpds; + } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2c4663bd993a..81868ad86330 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -219,9 +219,11 @@ struct blkg_conf_ctx { void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); +unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git a/block/blk-core.c b/block/blk-core.c index d6c4fa3943b5..4623de79effa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -429,6 +429,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); + mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); @@ -455,6 +456,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); + /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */ + fs_reclaim_acquire(GFP_KERNEL); + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); + fs_reclaim_release(GFP_KERNEL); + q->nr_requests = BLKDEV_DEFAULT_RQ; return q; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 29a205482617..f154be0b575a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split; * This is the key we set when evicting a keyslot. This *should* be the all 0's * key, but AES-XTS rejects that key, so we use some random bytes instead. */ -static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; +static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { @@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, key->size); if (err) { blk_crypto_fallback_evict_keyslot(slot); @@ -539,7 +539,7 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + get_random_bytes(blank_key, sizeof(blank_key)); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) @@ -561,6 +561,7 @@ static int blk_crypto_fallback_init(void) blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops; blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; /* All blk-crypto modes have a crypto API fallback. 
*/ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 93a141979694..ccf6dff6ff6b 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -14,6 +14,7 @@ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ + unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; @@ -82,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -129,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq) return false; } +static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 7fabc883e39f..94a155912bf1 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return false; if (profile->max_dun_bytes_supported < cfg->dun_bytes) return false; + if (!(profile->key_types_supported & cfg->key_type)) + return false; return true; } @@ -463,6 +465,99 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, EXPORT_SYMBOL_GPL(blk_crypto_register); /** + * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key + * @bdev: a block device that supports hardware-wrapped keys + * @eph_key: a hardware-wrapped key in ephemerally-wrapped form + * @eph_key_size: size of @eph_key in bytes + * @sw_secret: (output) the software secret + * + * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that + * it is used for I/O), ask the hardware to derive the secret which software can + * use for cryptographic tasks other than inline encryption. This secret is + * guaranteed to be cryptographically isolated from the inline encryption key, + * i.e. derived with a different KDF context. + * + * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support + * hardware-wrapped keys, -EBADMSG if the key isn't a valid + * ephemerally-wrapped key, or another -errno code. 
+ */ +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.derive_sw_secret) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size, + sw_secret); + blk_crypto_hw_exit(profile); + return err; +} + +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.import_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size, + lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.generate_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.generate_key(profile, lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.prepare_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size, + eph_key); + blk_crypto_hw_exit(profile); + return ret; +} + +/** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device * @parent: the crypto profile for the parent device @@ -485,10 +580,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, child->max_dun_bytes_supported); for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) parent->modes_supported[i] &= child->modes_supported[i]; + parent->key_types_supported &= child->key_types_supported; } else { parent->max_dun_bytes_supported = 0; memset(parent->modes_supported, 0, sizeof(parent->modes_supported)); + parent->key_types_supported = 0; } } EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); @@ -521,6 +618,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, target->max_dun_bytes_supported) return false; + if (reference->key_types_supported & ~target->key_types_supported) + return false; + return true; } EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); @@ -555,5 +655,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, sizeof(dst->modes_supported)); dst->max_dun_bytes_supported = src->max_dun_bytes_supported; + dst->key_types_supported = src->key_types_supported; } EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index a304434489ba..e832f403f200 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -31,6 +31,13 @@ static struct 
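
The wrapped-key helpers added above are what an upper layer would call. A rough sketch of a consumer deriving the software secret from an ephemerally-wrapped key, assuming kernel context and linux/blk-crypto.h for the declarations; everything except blk_crypto_derive_sw_secret() itself is illustrative:

static int example_get_sw_secret(struct block_device *bdev,
                                 const u8 *eph_key, size_t eph_key_size)
{
        u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE];
        int err;

        err = blk_crypto_derive_sw_secret(bdev, eph_key, eph_key_size,
                                          sw_secret);
        if (err)
                return err;     /* -EOPNOTSUPP: no HW-wrapped key support */

        /* ... feed sw_secret into a KDF for other crypto uses ... */

        memzero_explicit(sw_secret, sizeof(sw_secret));
        return 0;
}
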
blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) return container_of(attr, struct blk_crypto_attr, attr); } +static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { @@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, return sysfs_emit(page, "%u\n", profile->num_slots); } +static ssize_t raw_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + #define BLK_CRYPTO_RO_ATTR(_name) \ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) +BLK_CRYPTO_RO_ATTR(hw_wrapped_keys); BLK_CRYPTO_RO_ATTR(max_dun_bits); BLK_CRYPTO_RO_ATTR(num_keyslots); +BLK_CRYPTO_RO_ATTR(raw_keys); + +static umode_t blk_crypto_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + if (a == &hw_wrapped_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return 0; + if (a == &raw_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW)) + return 0; + + return 0444; +} static struct attribute *blk_crypto_attrs[] = { + &hw_wrapped_keys_attr.attr, &max_dun_bits_attr.attr, &num_keyslots_attr.attr, + &raw_keys_attr.attr, NULL, }; static const struct attribute_group blk_crypto_attr_group = { .attrs = blk_crypto_attrs, + .is_visible = blk_crypto_is_visible, }; /* diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 4d760b092deb..4b1ad84d1b5a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -23,24 +23,28 @@ const struct blk_crypto_mode blk_crypto_modes[] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, + .security_strength = 16, .ivsize = 16, }, }; @@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void) /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); - /* Sanity check that no algorithm exceeds the defined limits. */ + /* + * Validate the crypto mode properties. This ideally would be done with + * static assertions, but boot-time checks are the next best thing. + */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { - BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].keysize > + BLK_CRYPTO_MAX_RAW_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].security_strength > + blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } @@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. 
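
Because unsupported key types are hidden by is_visible() rather than reported as "unsupported", userspace detects them by file presence. A small stand-alone check (the disk name is an example):

#include <stdio.h>
#include <unistd.h>

/* Returns 1 if the disk's crypto profile supports the given key type. */
static int supports(const char *disk, const char *keytype)
{
        char path[256];

        snprintf(path, sizeof(path),
                 "/sys/block/%s/queue/crypto/%s", disk, keytype);
        return access(path, F_OK) == 0;
}

int main(void)
{
        const char *disk = "sda";       /* example device */

        printf("raw keys:        %s\n", supports(disk, "raw_keys") ? "yes" : "no");
        printf("HW-wrapped keys: %s\n", supports(disk, "hw_wrapped_keys") ? "yes" : "no");
        return 0;
}
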
- * @raw_key: Pointer to the raw key. Must be the correct length for the chosen - * @crypto_mode; see blk_crypto_modes[]. + * @key_bytes: the bytes of the key + * @key_size: size of the key in bytes + * @key_type: type of the key -- either raw or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for - * zeroizing both blk_key and raw_key when done with them. + * zeroizing both blk_key and key_bytes when done with them. */ -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) @@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; - if (mode->keysize == 0) + switch (key_type) { + case BLK_CRYPTO_KEY_TYPE_RAW: + if (key_size != mode->keysize) + return -EINVAL; + break; + case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: + if (key_size < mode->security_strength || + key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + return -EINVAL; + break; + default: return -EINVAL; + } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; @@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); - blk_key->size = mode->keysize; - memcpy(blk_key->raw, raw_key, mode->keysize); + blk_key->size = key_size; + memcpy(blk_key->bytes, key_bytes, key_size); return 0; } @@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { - return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_crypto_config_supported_natively(bdev, cfg); + if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && + cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW) + return true; + return blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev, * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * - * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and - * blk-crypto-fallback is either disabled or the needed algorithm - * is disabled in the crypto API; or another -errno code. + * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does + * not support wrapped keys; -ENOPKG if the key is a raw key but the + * hardware does not support raw keys and blk-crypto-fallback is either + * disabled or the needed algorithm is disabled in the crypto API; or + * another -errno code if something else went wrong. 
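
A hedged sketch of a caller using the new signature for a raw AES-256-XTS key; kernel context and linux/blk-crypto.h plus linux/random.h are assumed, and the surrounding function is hypothetical:

static int example_init_raw_key(struct blk_crypto_key *key)
{
        u8 raw[64];     /* AES-256-XTS uses a 64-byte raw key */
        int err;

        get_random_bytes(raw, sizeof(raw));
        err = blk_crypto_init_key(key, raw, sizeof(raw),
                                  BLK_CRYPTO_KEY_TYPE_RAW,
                                  BLK_ENCRYPTION_MODE_AES_256_XTS,
                                  8,            /* dun_bytes */
                                  4096);        /* data_unit_size */
        memzero_explicit(raw, sizeof(raw));
        return err;
}

A hardware-wrapped key would instead pass BLK_CRYPTO_KEY_TYPE_HW_WRAPPED and the ephemerally-wrapped blob, whose size only has to fall between the mode's security strength and BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE.
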
*/ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; + if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) { + pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev); + return -EOPNOTSUPP; + } return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -436,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev, pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); + +static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_import_key_arg arg; + u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) + return -EINVAL; + + if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), + arg.raw_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(raw_key, sizeof(raw_key)); + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_generate_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + ret = blk_crypto_generate_key(profile, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_prepare_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.lt_key_size > sizeof(lt_key)) + return -EINVAL; + + if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), + arg.lt_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); + if (ret < 0) + goto out; + if (ret > arg.eph_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.eph_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, + arg.eph_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + memzero_explicit(eph_key, sizeof(eph_key)); + return ret; +} + +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + 
void __user *argp) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + + if (!profile) + return -EOPNOTSUPP; + + switch (cmd) { + case BLKCRYPTOIMPORTKEY: + return blk_crypto_ioctl_import_key(profile, argp); + case BLKCRYPTOGENERATEKEY: + return blk_crypto_ioctl_generate_key(profile, argp); + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl_prepare_key(profile, argp); + default: + return -ENOTTY; + } +} diff --git a/block/blk-flush.c b/block/blk-flush.c index a72e2a83d075..43d6152897a4 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,9 +95,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +blk_get_flush_queue(struct blk_mq_ctx *ctx) { - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; + return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) @@ -205,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, struct list_head *running; struct request *rq, *n; unsigned long flags = 0; - struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -341,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; - struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); @@ -382,7 +382,7 @@ static void blk_rq_init_flush(struct request *rq) bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ed11438eb4be..5bfd70311359 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2718,8 +2718,7 @@ retry_lock: * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. 
*/ - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); - wait.wait.private = current; + init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ @@ -3223,14 +3222,16 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; - unsigned int memflags; + unsigned long memflags; int ret; blkg_conf_init(&ctx, input); - ret = blkg_conf_open_bdev(&ctx); - if (ret) + memflags = blkg_conf_open_bdev_frozen(&ctx); + if (IS_ERR_VALUE(memflags)) { + ret = memflags; goto err; + } body = ctx.body; disk = ctx.bdev->bd_disk; @@ -3247,7 +3248,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } - memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3347,19 +3347,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue, memflags); - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); - blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue, memflags); - ret = -EINVAL; err: - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return ret; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 1d1589c35297..fdd4efb54c6c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -551,8 +551,8 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, * Map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries. 
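
The open-frozen/exit-frozen pairing used by ioc_qos_write() above is the pattern for any blkcg policy write handler that must freeze the queue before parsing per-device input. A condensed kernel-context sketch, with policy-specific parsing elided:

static ssize_t example_policy_write(struct kernfs_open_file *of, char *input,
                                    size_t nbytes, loff_t off)
{
        struct blkg_conf_ctx ctx;
        unsigned long memflags;
        ssize_t ret;

        blkg_conf_init(&ctx, input);

        /* Opens the bdev and returns with its queue frozen. */
        memflags = blkg_conf_open_bdev_frozen(&ctx);
        if (IS_ERR_VALUE(memflags)) {
                ret = memflags;
                goto out;
        }

        /* ... parse ctx.body and update per-device state ... */
        ret = nbytes;
out:
        /* Unfreezes the queue and drops the bdev reference. */
        blkg_conf_exit_frozen(&ctx, memflags);
        return ret;
}
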
*/ -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) { struct req_iterator iter = { .bio = rq->bio, diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index adf5f0697b6b..3421b5521fe2 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -347,9 +347,14 @@ static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; + int res; + res = mutex_lock_interruptible(&hctx->queue->elevator_lock); + if (res) + return res; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, ¶ms); + mutex_unlock(&hctx->queue->elevator_lock); return 0; } @@ -400,15 +405,14 @@ static int hctx_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) @@ -417,15 +421,14 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_show(void *data, struct seq_file *m) @@ -434,15 +437,14 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) @@ -451,15 +453,14 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_active_show(void *data, struct seq_file *m) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7442ca27c2bf..109611445d40 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -349,7 +349,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, } ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + hctx = blk_mq_map_queue(bio->bi_opf, ctx); type = hctx->type; if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3feeeccf8a99..24656980f443 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) 
return -EIO; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return res; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index b9f417d980b4..d880c50629d6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -190,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, - data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 40490ac88045..ae8494d88897 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -508,7 +508,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); if (q->elevator) { /* @@ -3314,6 +3314,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -4094,6 +4095,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&q->elevator_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4198,6 +4201,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + mutex_unlock(&q->elevator_lock); } /* @@ -4467,7 +4472,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4500,7 +4505,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4933,10 +4938,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, if (!qe) return false; - /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); + /* Accessing q->elevator needs protection from ->elevator_lock. 
*/ + mutex_lock(&q->elevator_lock); - /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); goto unlock; @@ -4950,7 +4954,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); elevator_disable(q); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return true; } @@ -4980,11 +4984,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, diff --git a/block/blk-mq.h b/block/blk-mq.h index 44979e92b79f..3011a78cf16a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -100,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue - * @q: request queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ -static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - blk_opf_t opf, +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d4d4f4dc0e23..95982bc46ba1 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -196,7 +196,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) struct rq_qos_wait_data { struct wait_queue_entry wq; - struct task_struct *task; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; @@ -218,7 +217,20 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; - wake_up_process(data->task); + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). + */ + default_wake_function(curr, mode, wake_flags, key); + /* + * Note that the order of operations is important as finish_wait() + * tests whether @curr is removed without grabbing the lock. This + * should be the last thing to do to make sure we will not have a + * UAF access to @data. And the semantics of memory barrier in it + * also make sure the waiter will see the latest @data->got_token + * once list_empty_careful() in finish_wait() returns true. + */ list_del_init_careful(&curr->entry); return 1; } @@ -244,41 +256,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, - .rqw = rqw, - .cb = acquire_inflight_cb, - .private_data = private_data, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + .got_token = false, }; - bool has_sleeper; + bool first_waiter; - has_sleeper = wq_has_sleeper(&rqw->wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + /* + * If there are no waiters in the waiting queue, try to increase the + * inflight counter if we can. Otherwise, prepare for adding ourselves + * to the waiting queue. 
+ */ + if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) return; - has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + init_wait_func(&data.wq, rq_qos_wake_function); + first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + /* + * Make sure there is at least one inflight process; otherwise, waiters + * will never be woken up. Since there may be no inflight process before + * adding ourselves to the waiting queue above, we need to try to + * increase the inflight counter for ourselves. And it is sufficient to + * guarantee that at least the first waiter to enter the waiting queue + * will re-check the waiting condition before going to sleep, thus + * ensuring forward progress. + */ + if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + /* + * We raced with rq_qos_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + * + * Enough memory barrier in list_empty_careful() in + * finish_wait() is paired with list_del_init_careful() + * in rq_qos_wake_function() to make sure we will see + * the latest @data->got_token. + */ + if (data.got_token) + cleanup_cb(rqw, private_data); + return; + } + + /* we are now relying on the waker to increase our inflight counter. */ do { - /* The memory barrier in set_current_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(&rqw->wait, &data.wq); - - /* - * We raced with rq_qos_wake_function() getting a token, - * which means we now have two. Put our local token - * and wake anyone else potentially waiting for one. - */ - if (data.got_token) - cleanup_cb(rqw, private_data); - return; - } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); diff --git a/block/blk-settings.c b/block/blk-settings.c index b9c6f0ec1c49..6b2dbe645d23 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -21,7 +21,7 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { - q->rq_timeout = timeout; + WRITE_ONCE(q->rq_timeout, timeout); } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); @@ -114,9 +114,15 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) pr_warn("invalid PI settings.\n"); return -EINVAL; } + bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY; return 0; } + if (lim->features & BLK_FEAT_BOUNCE_HIGH) { + pr_warn("no bounce buffer support for integrity metadata\n"); + return -EINVAL; + } + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; @@ -867,36 +873,28 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) return true; - if (!ti->tuple_size) { - /* inherit the settings from the first underlying device */ - if (!(ti->flags & BLK_INTEGRITY_STACKED)) { - ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | - (bi->flags & BLK_INTEGRITY_REF_TAG); - ti->csum_type = bi->csum_type; - ti->tuple_size = bi->tuple_size; - ti->pi_offset = bi->pi_offset; - ti->interval_exp = bi->interval_exp; - ti->tag_size = bi->tag_size; - goto done; - } - if (!bi->tuple_size) - goto done; + if (ti->flags & BLK_INTEGRITY_STACKED) { + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != 
bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + } else { + ti->flags = BLK_INTEGRITY_STACKED; + ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; } - - if (ti->tuple_size != bi->tuple_size) - goto incompatible; - if (ti->interval_exp != bi->interval_exp) - goto incompatible; - if (ti->tag_size != bi->tag_size) - goto incompatible; - if (ti->csum_type != bi->csum_type) - goto incompatible; - if ((ti->flags & BLK_INTEGRITY_REF_TAG) != - (bi->flags & BLK_INTEGRITY_REF_TAG)) - goto incompatible; - -done: - ti->flags |= BLK_INTEGRITY_STACKED; return true; incompatible: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6f548a4376aa..a2882751f0d2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -23,10 +23,11 @@ struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); + ssize_t (*show_limit)(struct gendisk *disk, char *page); + ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); - void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; static ssize_t @@ -52,7 +53,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count) static ssize_t queue_requests_show(struct gendisk *disk, char *page) { - return queue_var_show(disk->queue->nr_requests, page); + ssize_t ret; + + mutex_lock(&disk->queue->elevator_lock); + ret = queue_var_show(disk->queue->nr_requests, page); + mutex_unlock(&disk->queue->elevator_lock); + return ret; } static ssize_t @@ -60,27 +66,38 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; + unsigned int memflags; + struct request_queue *q = disk->queue; - if (!queue_is_mq(disk->queue)) + if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) - return err; - + ret = err; + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { - return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); + ssize_t ret; + + mutex_lock(&disk->queue->limits_lock); + ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); + mutex_unlock(&disk->queue->limits_lock); + + return ret; } static ssize_t @@ -88,11 +105,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; + unsigned int memflags; + struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; + /* + * ->ra_pages is protected by ->limits_lock because it is usually + * calculated from the queue limits by queue_limits_commit_update. 
+ */ + mutex_lock(&q->limits_lock); + memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + mutex_unlock(&q->limits_lock); + blk_mq_unfreeze_queue(q, memflags); + return ret; } @@ -238,8 +266,9 @@ static ssize_t queue_poll_show(struct gendisk *disk, char *page) { if (queue_is_mq(disk->queue)) return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + return sysfs_emit(page, "%u\n", - !!(disk->queue->limits.features & BLK_FEAT_POLL)); + !!(disk->queue->limits.features & BLK_FEAT_POLL)); } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) @@ -286,17 +315,21 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; + unsigned int memflags; + struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue); + memflags = blk_mq_freeze_queue(q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) - blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -316,11 +349,19 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; + unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; + /* + * Here we update two queue flags each using atomic bitops, although + * updating two flags isn't atomic it should be harmless as those flags + * are accessed individually using atomic test_bit operation. So we + * don't grab any lock while updating these flags. 
+ */ + memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -331,6 +372,7 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } + blk_mq_unfreeze_queue(q, memflags); #endif return ret; } @@ -344,29 +386,43 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - if (!(disk->queue->limits.features & BLK_FEAT_POLL)) - return -EINVAL; + unsigned int memflags; + ssize_t ret = count; + struct request_queue *q = disk->queue; + + memflags = blk_mq_freeze_queue(q); + if (!(q->limits.features & BLK_FEAT_POLL)) { + ret = -EINVAL; + goto out; + } + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); - return count; +out: + blk_mq_unfreeze_queue(q, memflags); + return ret; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); + return sysfs_emit(page, "%u\n", + jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout))); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val; + unsigned int val, memflags; int err; + struct request_queue *q = disk->queue; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; - blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val)); + memflags = blk_mq_freeze_queue(q); + blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + blk_mq_unfreeze_queue(q, memflags); return count; } @@ -412,57 +468,55 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store = _prefix##_store, \ }; -#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .store_limit = _prefix##_store, \ + .attr = { .name = _name, .mode = 0444 }, \ + .show_limit = _prefix##_show, \ } -#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ +#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .load_module = _prefix##_load_module, \ - .store = _prefix##_store, \ + .show_limit = _prefix##_show, \ + .store_limit = _prefix##_store, \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); -QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); -QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); -QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler"); - -QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); -QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); -QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); -QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); -QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); - -QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); -QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); 
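
Expanding one of the converted read-only attributes makes the change concrete: the entry now fills in ->show_limit rather than ->show, which is what routes it through q->limits_lock in queue_attr_show(). For example, QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments") expands to:

static struct queue_sysfs_entry queue_max_segments_entry = {
        .attr = { .name = "max_segments", .mode = 0444 },
        .show_limit = queue_max_segments_show,
};
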
-QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); +QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_RW_ENTRY(elv_iosched, "scheduler"); + +QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size"); + +QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); -QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors, +QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors, "atomic_write_boundary_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); -QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); -QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); -QUEUE_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); -QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); -QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); @@ -470,16 +524,16 @@ QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); -QUEUE_RO_ENTRY(queue_fua, "fua"); -QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_LIM_RO_ENTRY(queue_fua, "fua"); +QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); -QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); +QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { - .attr = {.name = "hw_sector_size", .mode = 0444 }, - .show = queue_logical_block_size_show, + .attr = {.name = "hw_sector_size", .mode = 0444 
}, + .show_limit = queue_logical_block_size_show, }; QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); @@ -503,14 +557,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page) static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { - if (!wbt_rq_qos(disk->queue)) - return -EINVAL; + ssize_t ret; + struct request_queue *q = disk->queue; - if (wbt_disabled(disk->queue)) - return sysfs_emit(page, "0\n"); + mutex_lock(&q->elevator_lock); + if (!wbt_rq_qos(q)) { + ret = -EINVAL; + goto out; + } + + if (wbt_disabled(q)) { + ret = sysfs_emit(page, "0\n"); + goto out; + } - return sysfs_emit(page, "%llu\n", - div_u64(wbt_get_min_lat(disk->queue), 1000)); + ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +out: + mutex_unlock(&q->elevator_lock); + return ret; } static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, @@ -520,6 +584,7 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, struct rq_qos *rqos; ssize_t ret; s64 val; + unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) @@ -527,20 +592,24 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + rqos = wbt_rq_qos(q); if (!rqos) { ret = wbt_init(disk); if (ret) - return ret; + goto out; } + ret = count; if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) - return count; + goto out; /* * Ensure that the queue is idled, in case the latency update @@ -552,8 +621,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); +out: + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); - return count; + return ret; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); @@ -561,7 +633,9 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { - &queue_ra_entry.attr, + /* + * Attributes which are protected with q->limits_lock. + */ &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, @@ -577,44 +651,58 @@ static struct attribute *queue_attrs[] = { &queue_discard_granularity_entry.attr, &queue_max_discard_sectors_entry.attr, &queue_max_hw_discard_sectors_entry.attr, - &queue_discard_zeroes_data_entry.attr, &queue_atomic_write_max_sectors_entry.attr, &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, - &queue_write_same_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, - &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, - &queue_nomerges_entry.attr, &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_add_random_entry.attr, - &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, - &queue_poll_delay_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, + &queue_ra_entry.attr, + + /* + * Attributes which don't require locking. 
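
Despite the relocking, these remain ordinary sysfs files, so userspace tooling is unaffected; each write simply ends up under whichever lock the attribute tables assign. An illustrative tuner (device name, scheduler name and values are examples only):

#include <stdio.h>

/* Write one value to a queue sysfs attribute of the given disk. */
static int queue_attr_write(const char *disk, const char *attr, const char *val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        const char *disk = "sda";                               /* example device */

        queue_attr_write(disk, "scheduler", "mq-deadline");     /* elevator_lock path */
        queue_attr_write(disk, "read_ahead_kb", "512");         /* limits_lock path */
        queue_attr_write(disk, "rq_affinity", "2");             /* lockless path */
        return 0;
}
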
+ */ + &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, + &queue_nr_zones_entry.attr, + &queue_nomerges_entry.attr, + &queue_poll_entry.attr, + &queue_poll_delay_entry.attr, + NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { - &queue_requests_entry.attr, + /* + * Attributes which require some form of locking other than + * q->sysfs_lock. + */ &elv_iosched_entry.attr, - &queue_rq_affinity_entry.attr, - &queue_io_timeout_entry.attr, + &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif + /* + * Attributes which don't require locking. + */ + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, + NULL, }; @@ -664,14 +752,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); - ssize_t res; - if (!entry->show) + if (!entry->show && !entry->show_limit) return -EIO; - mutex_lock(&disk->queue->sysfs_lock); - res = entry->show(disk, page); - mutex_unlock(&disk->queue->sysfs_lock); - return res; + + if (entry->show_limit) { + ssize_t res; + + mutex_lock(&disk->queue->limits_lock); + res = entry->show_limit(disk, page); + mutex_unlock(&disk->queue->limits_lock); + return res; + } + + return entry->show(disk, page); } static ssize_t @@ -681,21 +775,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - unsigned int memflags; - ssize_t res; if (!entry->store_limit && !entry->store) return -EIO; - /* - * If the attribute needs to load a module, do it before freezing the - * queue to ensure that the module file can be read when the request - * queue is the one for the device storing the module file. 
- */ - if (entry->load_module) - entry->load_module(disk, page, length); - if (entry->store_limit) { + ssize_t res; + struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); @@ -710,12 +796,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, return length; } - mutex_lock(&q->sysfs_lock); - memflags = blk_mq_freeze_queue(q); - res = entry->store(disk, page, length); - blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); - return res; + return entry->store(disk, page, length); } static const struct sysfs_ops queue_sysfs_ops = { @@ -784,18 +865,22 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_debugfs_remove; + ret = blk_crypto_sysfs_register(disk); + if (ret) + goto out_unregister_ia_ranges; + + mutex_lock(&q->elevator_lock); if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) - goto out_unregister_ia_ranges; + if (ret) { + mutex_unlock(&q->elevator_lock); + goto out_crypto_sysfs_unregister; + } } - - ret = blk_crypto_sysfs_register(disk); - if (ret) - goto out_elv_unregister; + wbt_enable_default(disk); + mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); @@ -817,8 +902,8 @@ int blk_register_queue(struct gendisk *disk) return ret; -out_elv_unregister: - elv_unregister_queue(q); +out_crypto_sysfs_unregister: + blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: @@ -864,8 +949,11 @@ void blk_unregister_queue(struct gendisk *disk) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elv_unregister_queue(q); + mutex_unlock(&q->elevator_lock); + + mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8d149aff9fd0..91dab43c65ab 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -478,8 +478,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -498,16 +496,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, - bool clear_carryover) + bool clear) { - tg->bytes_disp[rw] = 0; - tg->io_disp[rw] = 0; + if (clear) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + } tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - if (clear_carryover) { - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; - } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -599,29 +595,34 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. 
*/ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); - if (!time_elapsed) + /* Don't trim slice until at least 2 slices are used */ + if (time_elapsed < tg->td->throtl_slice * 2) return; + /* + * The bio submission time may be a few jiffies more than the expected + * waiting time, due to 'extra_bytes' can't be divided in + * tg_within_bps_limit(), and also due to timer wakeup delay. In this + * case, adjust slice_start will discard the extra wait time, causing + * lower rate than expected. Therefore, other than the above rounddown, + * one extra slice is preserved for deviation. + */ + time_elapsed -= tg->td->throtl_slice; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed) + - tg->carryover_bytes[rw]; - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + - tg->carryover_ios[rw]; + time_elapsed); + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); if (bytes_trim <= 0 && io_trim <= 0) return; - tg->carryover_bytes[rw] = 0; if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; - tg->carryover_ios[rw] = 0; if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else @@ -636,7 +637,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) jiffies); } -static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +static void __tg_update_carryover(struct throtl_grp *tg, bool rw, + long long *bytes, int *ios) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); @@ -649,26 +651,28 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw) * configuration. */ if (bps_limit != U64_MAX) - tg->carryover_bytes[rw] += - calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) - tg->bytes_disp[rw]; if (iops_limit != UINT_MAX) - tg->carryover_ios[rw] += - calculate_io_allowed(iops_limit, jiffy_elapsed) - + *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) - tg->io_disp[rw]; + tg->bytes_disp[rw] -= *bytes; + tg->io_disp[rw] -= *ios; } static void tg_update_carryover(struct throtl_grp *tg) { + long long bytes[2] = {0}; + int ios[2] = {0}; + if (tg->service_queue.nr_queued[READ]) - __tg_update_carryover(tg, READ); + __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); if (tg->service_queue.nr_queued[WRITE]) - __tg_update_carryover(tg, WRITE); + __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); /* see comments in struct throtl_grp for meaning of these fields. 
*/ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, - tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], - tg->carryover_ios[READ], tg->carryover_ios[WRITE]); + bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, @@ -686,8 +690,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + - tg->carryover_ios[rw]; + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; @@ -720,8 +723,7 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + - tg->carryover_bytes[rw]; + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; @@ -810,13 +812,10 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->bytes_disp[rw] += bio_size; - tg->last_bytes_disp[rw] += bio_size; - } tg->io_disp[rw]++; - tg->last_io_disp[rw]++; } /** @@ -1614,13 +1613,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) return tg_may_dispatch(tg, bio, NULL); } -static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) -{ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) - tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); - tg->carryover_ios[rw]--; -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -1657,10 +1649,12 @@ bool __blk_throtl_bio(struct bio *bio) /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. - * Debts are handled by carryover_bytes/ios while - * calculating wait time. + * + * Charge and dispatch directly, and our throttle + * control algorithm is adaptive, and extra IO bytes + * will be throttled for paying the debt */ - tg_dispatch_in_debt(tg, bio, rw); + throtl_charge_bio(tg, bio); } else { /* if above limits, break to queue */ break; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 1a36d1278eea..7964cc041e06 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -102,12 +102,9 @@ struct throtl_grp { unsigned int iops[2]; /* Number of bytes dispatched in current slice */ - uint64_t bytes_disp[2]; + int64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; - - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; + int io_disp[2]; /* * The following two fields are updated when new configuration is diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6dfc659d22e2..f1754d07f7e0 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -136,8 +136,9 @@ enum { RWB_MIN_WRITE_SAMPLES = 3, /* - * If we have this number of consecutive windows with not enough - * information to scale up or down, scale up. 
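
To make the "paying the debt" behaviour concrete, here is rough back-of-envelope arithmetic as a plain userspace program; the numbers are illustrative and the kernel's exact wait-time computation in tg_within_bps_limit() is not reproduced here:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t bps_limit = 1024 * 1024;       /* 1 MiB/s cgroup limit */
        uint64_t slice_ms = 100;                /* throttle slice length */
        int64_t bytes_disp = 0;
        uint64_t allowed;
        int64_t debt;

        /* Budget for one slice at this limit. */
        allowed = bps_limit * slice_ms / 1000;

        /*
         * A 1 MiB bio that may cause a priority inversion is charged and
         * dispatched immediately instead of being queued.
         */
        bytes_disp += 1024 * 1024;

        /*
         * The debt left after this slice's budget; later bios wait until
         * enough slices have elapsed to cover it, roughly debt / limit.
         */
        debt = bytes_disp - (int64_t)allowed;
        printf("slice budget: %llu bytes, debt: %lld bytes (~%.2f s extra wait)\n",
               (unsigned long long)allowed, (long long)debt,
               (double)debt / bps_limit);
        return 0;
}
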
+ * If we have this number of consecutive windows without enough + * information to scale up or down, slowly return to center state + * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; @@ -446,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) break; case LAT_UNKNOWN_WRITES: /* - * We started a the center step, but don't have a valid - * read/write sample, but we do have writes going on. - * Allow step to go negative, to increase write perf. + * We don't have a valid read/write sample, but we do have + * writes going on. Allow step to go negative, to increase + * write performance. */ scale_up(rwb); break; @@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) __wbt_done(rqos, flags); } -/* - * May sleep, if we have exceeded the writeback limits. Caller can pass - * in an irq held spinlock, if it holds one when calling this function. - * If we do sleep, we'll release and re-grab it. - */ +/* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); diff --git a/block/blk.h b/block/blk.h index 9cf9a0099416..006e3be433d2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -715,7 +715,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); -void blk_integrity_verify(struct bio *bio); +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); diff --git a/block/bounce.c b/block/bounce.c index 0d898cd5ec49..09a9616cf209 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -41,8 +41,6 @@ static void init_bounce_bioset(void) ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); BUG_ON(ret); - if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) - BUG_ON(1); ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); BUG_ON(ret); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 93523d8f8195..9ceb5d0832f5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); - buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; } diff --git a/block/elevator.c b/block/elevator.c index cd2ce4921601..b4d08026b02c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { @@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q) { struct elevator_queue *e = q->elevator; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); @@ -618,7 +618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) unsigned int memflags; int ret; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -655,7 +655,7 @@ void 
elevator_disable(struct request_queue *q) { unsigned int memflags; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -700,34 +700,44 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -void elv_iosched_load_module(struct gendisk *disk, const char *buf, - size_t count) +static void elv_iosched_load_module(char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *found; - const char *name; - - strscpy(elevator_name, buf, sizeof(elevator_name)); - name = strstrip(elevator_name); spin_lock(&elv_list_lock); - found = __elevator_find(name); + found = __elevator_find(elevator_name); spin_unlock(&elv_list_lock); if (!found) - request_module("%s-iosched", name); + request_module("%s-iosched", elevator_name); } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; + char *name; int ret; + unsigned int memflags; + struct request_queue *q = disk->queue; + /* + * If the attribute needs to load a module, do it before freezing the + * queue to ensure that the module file can be read when the request + * queue is the one for the device storing the module file. + */ strscpy(elevator_name, buf, sizeof(elevator_name)); - ret = elevator_change(disk->queue, strstrip(elevator_name)); + name = strstrip(elevator_name); + + elv_iosched_load_module(name); + + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + ret = elevator_change(q, name); if (!ret) - return count; + ret = count; + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -738,6 +748,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; + mutex_lock(&q->elevator_lock); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { @@ -755,6 +766,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); + mutex_unlock(&q->elevator_lock); + return len; } diff --git a/block/elevator.h b/block/elevator.h index e526662c5dbb..e4e44dfac503 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -148,8 +148,6 @@ extern void elv_unregister(struct elevator_type *); * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); -void elv_iosched_load_module(struct gendisk *disk, const char *page, - size_t count); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); diff --git a/block/genhd.c b/block/genhd.c index e9375e20d866..c2bd86cd09de 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -565,8 +565,11 @@ out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: - if (disk->queue->elevator) + if (disk->queue->elevator) { + mutex_lock(&disk->queue->elevator_lock); elevator_exit(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); + } return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); @@ -742,9 +745,9 @@ void del_gendisk(struct gendisk *disk) blk_mq_quiesce_queue(q); if (q->elevator) { - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_exit(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); diff --git a/block/ioctl.c b/block/ioctl.c index 6554b728bae6..faa40f383e27 100644 --- a/block/ioctl.c +++ 
b/block/ioctl.c @@ -15,6 +15,7 @@ #include <linux/io_uring/cmd.h> #include <uapi/linux/blkdev.h> #include "blk.h" +#include "blk-crypto-internal.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) @@ -620,6 +621,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKCRYPTOIMPORTKEY: + case BLKCRYPTOGENERATEKEY: + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index dc31f2dfa414..0f0f8452609a 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -568,7 +568,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 9cc6b8c1eea4..b5ecddd5181a 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state) p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - /*printk("Dev %s SGI disklabel: bad magic %08x\n", - state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index ddf9e6def4b2..2419af76120f 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { -/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 2577114ff20c..851db518ee5e 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio) } } -void blk_integrity_verify(struct bio *bio) +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); @@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio) */ iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.seed = bip->bio_iter.bi_sector; + iter.seed = saved_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + __bio_for_each_segment(bv, bio, bviter, *saved_iter) { void *kaddr = bvec_kmap_local(&bv); blk_status_t ret = BLK_STS_OK; diff --git a/crypto/Kconfig b/crypto/Kconfig index b8a436edf4c3..69bbe696b94d 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -141,6 +141,12 @@ config CRYPTO_ACOMP select CRYPTO_ALGAPI select CRYPTO_ACOMP2 +config CRYPTO_HKDF + tristate + select CRYPTO_SHA256 if !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS + select CRYPTO_SHA512 if !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS + select CRYPTO_HASH2 + config CRYPTO_MANAGER tristate "Cryptographic algorithm manager" 
select CRYPTO_MANAGER2 diff --git a/crypto/Makefile b/crypto/Makefile index 2f7d64eeb008..c95e95e75ad4 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o obj-$(CONFIG_CRYPTO_SIG2) += sig.o obj-$(CONFIG_CRYPTO_KPP2) += kpp.o +obj-$(CONFIG_CRYPTO_HKDF) += hkdf.o dh_generic-y := dh.o dh_generic-y += dh_helper.o diff --git a/crypto/hkdf.c b/crypto/hkdf.c new file mode 100644 index 000000000000..2434c5c42545 --- /dev/null +++ b/crypto/hkdf.c @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * + * Copyright 2019 Google LLC + */ + +#include <crypto/internal/hash.h> +#include <crypto/sha2.h> +#include <crypto/hkdf.h> +#include <linux/module.h> + +/* + * HKDF consists of two steps: + * + * 1. HKDF-Extract: extract a pseudorandom key from the input keying material + * and optional salt. + * 2. HKDF-Expand: expand the pseudorandom key into output keying material of + * any length, parameterized by an application-specific info string. + * + */ + +/** + * hkdf_extract - HKDF-Extract (RFC 5869 section 2.2) + * @hmac_tfm: an HMAC transform using the hash function desired for HKDF. The + * caller is responsible for setting the @prk afterwards. + * @ikm: input keying material + * @ikmlen: length of @ikm + * @salt: input salt value + * @saltlen: length of @salt + * @prk: resulting pseudorandom key + * + * Extracts a pseudorandom key @prk from the input keying material + * @ikm with length @ikmlen and salt @salt with length @saltlen. + * The length of @prk is given by the digest size of @hmac_tfm. + * For an 'unsalted' version of HKDF-Extract @salt must be set + * to all zeroes and @saltlen must be set to the length of @prk. + * + * Returns 0 on success with the pseudorandom key stored in @prk, + * or a negative errno value otherwise. + */ +int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, const u8 *salt, unsigned int saltlen, + u8 *prk) +{ + int err; + + err = crypto_shash_setkey(hmac_tfm, salt, saltlen); + if (!err) + err = crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); + + return err; +} +EXPORT_SYMBOL_GPL(hkdf_extract); + +/** + * hkdf_expand - HKDF-Expand (RFC 5869 section 2.3) + * @hmac_tfm: hash context keyed with pseudorandom key + * @info: application-specific information + * @infolen: length of @info + * @okm: output keying material + * @okmlen: length of @okm + * + * This expands the pseudorandom key, which was already keyed into @hmac_tfm, + * into @okmlen bytes of output keying material parameterized by the + * application-specific @info of length @infolen bytes. + * This is thread-safe and may be called by multiple threads in parallel. + * + * Returns 0 on success with output keying material stored in @okm, + * or a negative errno value otherwise. 
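Taken together, hkdf_extract() and hkdf_expand() give callers the usual two-step HKDF flow: derive a PRK from the input keying material, key the HMAC transform with it, then expand per-use subkeys under distinct info strings. A minimal sketch of a kernel-side caller, mirroring the calls made by the self-test further down (error paths trimmed; "hmac(sha256)" and the helper name derive_subkey are illustrative choices, not part of the patch):

#include <linux/err.h>
#include <linux/string.h>
#include <crypto/hash.h>
#include <crypto/hkdf.h>
#include <crypto/sha2.h>

static int derive_subkey(const u8 *ikm, unsigned int ikmlen,
                         const u8 *salt, unsigned int saltlen,
                         const u8 *info, unsigned int infolen,
                         u8 *subkey, unsigned int subkeylen)
{
        struct crypto_shash *tfm;
        u8 prk[SHA256_DIGEST_SIZE];    /* assumes the digest size matches the chosen hash */
        int err;

        tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* Step 1: PRK = HKDF-Extract(salt, IKM) */
        err = hkdf_extract(tfm, ikm, ikmlen, salt, saltlen, prk);
        if (err)
                goto out;

        /* Key the transform with the PRK before expanding. */
        err = crypto_shash_setkey(tfm, prk, sizeof(prk));
        if (err)
                goto out;

        /* Step 2: OKM = HKDF-Expand(PRK, info, L), L <= 255 * digest size */
        err = hkdf_expand(tfm, info, infolen, subkey, subkeylen);
out:
        memzero_explicit(prk, sizeof(prk));
        crypto_free_shash(tfm);
        return err;
}
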
+ */ +int hkdf_expand(struct crypto_shash *hmac_tfm, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) +{ + SHASH_DESC_ON_STACK(desc, hmac_tfm); + unsigned int i, hashlen = crypto_shash_digestsize(hmac_tfm); + int err; + const u8 *prev = NULL; + u8 counter = 1; + u8 tmp[HASH_MAX_DIGESTSIZE] = {}; + + if (WARN_ON(okmlen > 255 * hashlen)) + return -EINVAL; + + desc->tfm = hmac_tfm; + + for (i = 0; i < okmlen; i += hashlen) { + err = crypto_shash_init(desc); + if (err) + goto out; + + if (prev) { + err = crypto_shash_update(desc, prev, hashlen); + if (err) + goto out; + } + + if (infolen) { + err = crypto_shash_update(desc, info, infolen); + if (err) + goto out; + } + + BUILD_BUG_ON(sizeof(counter) != 1); + if (okmlen - i < hashlen) { + err = crypto_shash_finup(desc, &counter, 1, tmp); + if (err) + goto out; + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + err = crypto_shash_finup(desc, &counter, 1, &okm[i]); + if (err) + goto out; + } + counter++; + prev = &okm[i]; + } + err = 0; +out: + if (unlikely(err)) + memzero_explicit(okm, okmlen); /* so caller doesn't need to */ + shash_desc_zero(desc); + memzero_explicit(tmp, HASH_MAX_DIGESTSIZE); + return err; +} +EXPORT_SYMBOL_GPL(hkdf_expand); + +struct hkdf_testvec { + const char *test; + const u8 *ikm; + const u8 *salt; + const u8 *info; + const u8 *prk; + const u8 *okm; + u16 ikm_size; + u16 salt_size; + u16 info_size; + u16 prk_size; + u16 okm_size; +}; + +/* + * HKDF test vectors from RFC5869 + * + * Additional HKDF test vectors from + * https://github.com/brycx/Test-Vector-Generation/blob/master/HKDF/hkdf-hmac-sha2-test-vectors.md + */ +static const struct hkdf_testvec hkdf_sha256_tv[] = { + { + .test = "basic hdkf test", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x07\x77\x09\x36\x2c\x2e\x32\xdf\x0d\xdc\x3f\x0d\xc4\x7b\xba\x63" + "\x90\xb6\xc7\x3b\xb5\x0f\x9c\x31\x22\xec\x84\x4a\xd7\xc2\xb3\xe5", + .prk_size = 32, + .okm = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a" + "\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf" + "\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65", + .okm_size = 42, + }, { + .test = "hkdf test with long input", + .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", + .ikm_size = 80, + .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", + .salt_size = 80, + .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" + 
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .info_size = 80, + .prk = "\x06\xa6\xb8\x8c\x58\x53\x36\x1a\x06\x10\x4c\x9c\xeb\x35\xb4\x5c" + "\xef\x76\x00\x14\x90\x46\x71\x01\x4a\x19\x3f\x40\xc1\x5f\xc2\x44", + .prk_size = 32, + .okm = "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34" + "\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c" + "\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09" + "\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71" + "\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f" + "\x1d\x87", + .okm_size = 82, + }, { + .test = "hkdf test with zero salt and info", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = NULL, + .salt_size = 0, + .info = NULL, + .info_size = 0, + .prk = "\x19\xef\x24\xa3\x2c\x71\x7b\x16\x7f\x33\xa9\x1d\x6f\x64\x8b\xdf" + "\x96\x59\x67\x76\xaf\xdb\x63\x77\xac\x43\x4c\x1c\x29\x3c\xcb\x04", + .prk_size = 32, + .okm = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31" + "\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d" + "\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8", + .okm_size = 42, + }, { + .test = "hkdf test with short input", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 11, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x82\x65\xf6\x9d\x7f\xf7\xe5\x01\x37\x93\x01\x5c\xa0\xef\x92\x0c" + "\xb1\x68\x21\x99\xc8\xbc\x3a\x00\xda\x0c\xab\x47\xb7\xb0\x0f\xdf", + .prk_size = 32, + .okm = "\x58\xdc\xe1\x0d\x58\x01\xcd\xfd\xa8\x31\x72\x6b\xfe\xbc\xb7\x43" + "\xd1\x4a\x7e\xe8\x3a\xa0\x57\xa9\x3d\x59\xb0\xa1\x31\x7f\xf0\x9d" + "\x10\x5c\xce\xcf\x53\x56\x92\xb1\x4d\xd5", + .okm_size = 42, + }, { + .test = "unsalted hkdf test with zero info", + .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" + "\x0c\x0c\x0c\x0c\x0c\x0c", + .ikm_size = 22, + .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + .salt_size = 32, + .info = NULL, + .info_size = 0, + .prk = "\xaa\x84\x1e\x1f\x35\x74\xf3\x2d\x13\xfb\xa8\x00\x5f\xcd\x9b\x8d" + "\x77\x67\x82\xa5\xdf\xa1\x92\x38\x92\xfd\x8b\x63\x5d\x3a\x89\xdf", + .prk_size = 32, + .okm = "\x59\x68\x99\x17\x9a\xb1\xbc\x00\xa7\xc0\x37\x86\xff\x43\xee\x53" + "\x50\x04\xbe\x2b\xb9\xbe\x68\xbc\x14\x06\x63\x6f\x54\xbd\x33\x8a" + "\x66\xa2\x37\xba\x2a\xcb\xce\xe3\xc9\xa7", + .okm_size = 42, + } +}; + +static const struct hkdf_testvec hkdf_sha384_tv[] = { + { + .test = "basic hkdf test", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x70\x4b\x39\x99\x07\x79\xce\x1d\xc5\x48\x05\x2c\x7d\xc3\x9f\x30" + "\x35\x70\xdd\x13\xfb\x39\xf7\xac\xc5\x64\x68\x0b\xef\x80\xe8\xde" + "\xc7\x0e\xe9\xa7\xe1\xf3\xe2\x93\xef\x68\xec\xeb\x07\x2a\x5a\xde", + .prk_size = 48, + .okm = "\x9b\x50\x97\xa8\x60\x38\xb8\x05\x30\x90\x76\xa4\x4b\x3a\x9f\x38" + "\x06\x3e\x25\xb5\x16\xdc\xbf\x36\x9f\x39\x4c\xfa\xb4\x36\x85\xf7" + "\x48\xb6\x45\x77\x63\xe4\xf0\x20\x4f\xc5", + .okm_size = 42, + }, { + .test = "hkdf test with long input", + .ikm = 
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", + .ikm_size = 80, + .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", + .salt_size = 80, + .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .info_size = 80, + .prk = "\xb3\x19\xf6\x83\x1d\xff\x93\x14\xef\xb6\x43\xba\xa2\x92\x63\xb3" + "\x0e\x4a\x8d\x77\x9f\xe3\x1e\x9c\x90\x1e\xfd\x7d\xe7\x37\xc8\x5b" + "\x62\xe6\x76\xd4\xdc\x87\xb0\x89\x5c\x6a\x7d\xc9\x7b\x52\xce\xbb", + .prk_size = 48, + .okm = "\x48\x4c\xa0\x52\xb8\xcc\x72\x4f\xd1\xc4\xec\x64\xd5\x7b\x4e\x81" + "\x8c\x7e\x25\xa8\xe0\xf4\x56\x9e\xd7\x2a\x6a\x05\xfe\x06\x49\xee" + "\xbf\x69\xf8\xd5\xc8\x32\x85\x6b\xf4\xe4\xfb\xc1\x79\x67\xd5\x49" + "\x75\x32\x4a\x94\x98\x7f\x7f\x41\x83\x58\x17\xd8\x99\x4f\xdb\xd6" + "\xf4\xc0\x9c\x55\x00\xdc\xa2\x4a\x56\x22\x2f\xea\x53\xd8\x96\x7a" + "\x8b\x2e", + .okm_size = 82, + }, { + .test = "hkdf test with zero salt and info", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = NULL, + .salt_size = 0, + .info = NULL, + .info_size = 0, + .prk = "\x10\xe4\x0c\xf0\x72\xa4\xc5\x62\x6e\x43\xdd\x22\xc1\xcf\x72\x7d" + "\x4b\xb1\x40\x97\x5c\x9a\xd0\xcb\xc8\xe4\x5b\x40\x06\x8f\x8f\x0b" + "\xa5\x7c\xdb\x59\x8a\xf9\xdf\xa6\x96\x3a\x96\x89\x9a\xf0\x47\xe5", + .prk_size = 48, + .okm = "\xc8\xc9\x6e\x71\x0f\x89\xb0\xd7\x99\x0b\xca\x68\xbc\xde\xc8\xcf" + "\x85\x40\x62\xe5\x4c\x73\xa7\xab\xc7\x43\xfa\xde\x9b\x24\x2d\xaa" + "\xcc\x1c\xea\x56\x70\x41\x5b\x52\x84\x9c", + .okm_size = 42, + }, { + .test = "hkdf test with short input", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 11, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x6d\x31\x69\x98\x28\x79\x80\x88\xb3\x59\xda\xd5\x0b\x8f\x01\xb0" + "\x15\xf1\x7a\xa3\xbd\x4e\x27\xa6\xe9\xf8\x73\xb7\x15\x85\xca\x6a" + "\x00\xd1\xf0\x82\x12\x8a\xdb\x3c\xf0\x53\x0b\x57\xc0\xf9\xac\x72", + .prk_size = 48, + .okm = "\xfb\x7e\x67\x43\xeb\x42\xcd\xe9\x6f\x1b\x70\x77\x89\x52\xab\x75" + "\x48\xca\xfe\x53\x24\x9f\x7f\xfe\x14\x97\xa1\x63\x5b\x20\x1f\xf1" + "\x85\xb9\x3e\x95\x19\x92\xd8\x58\xf1\x1a", + .okm_size = 42, + }, { + .test = "unsalted hkdf test with zero info", + .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" + "\x0c\x0c\x0c\x0c\x0c\x0c", + .ikm_size = 22, + .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + .salt_size = 48, + 
.info = NULL, + .info_size = 0, + .prk = "\x9d\x2d\xa5\x06\x6f\x05\xd1\x6c\x59\xfe\xdf\x6c\x5f\x32\xc7\x5e" + "\xda\x9a\x47\xa7\x9c\x93\x6a\xa4\x4c\xb7\x63\xa8\xe2\x2f\xfb\xfc" + "\xd8\xfe\x55\x43\x58\x53\x47\x21\x90\x39\xd1\x68\x28\x36\x33\xf5", + .prk_size = 48, + .okm = "\x6a\xd7\xc7\x26\xc8\x40\x09\x54\x6a\x76\xe0\x54\x5d\xf2\x66\x78" + "\x7e\x2b\x2c\xd6\xca\x43\x73\xa1\xf3\x14\x50\xa7\xbd\xf9\x48\x2b" + "\xfa\xb8\x11\xf5\x54\x20\x0e\xad\x8f\x53", + .okm_size = 42, + } +}; + +static const struct hkdf_testvec hkdf_sha512_tv[] = { + { + .test = "basic hkdf test", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x66\x57\x99\x82\x37\x37\xde\xd0\x4a\x88\xe4\x7e\x54\xa5\x89\x0b" + "\xb2\xc3\xd2\x47\xc7\xa4\x25\x4a\x8e\x61\x35\x07\x23\x59\x0a\x26" + "\xc3\x62\x38\x12\x7d\x86\x61\xb8\x8c\xf8\x0e\xf8\x02\xd5\x7e\x2f" + "\x7c\xeb\xcf\x1e\x00\xe0\x83\x84\x8b\xe1\x99\x29\xc6\x1b\x42\x37", + .prk_size = 64, + .okm = "\x83\x23\x90\x08\x6c\xda\x71\xfb\x47\x62\x5b\xb5\xce\xb1\x68\xe4" + "\xc8\xe2\x6a\x1a\x16\xed\x34\xd9\xfc\x7f\xe9\x2c\x14\x81\x57\x93" + "\x38\xda\x36\x2c\xb8\xd9\xf9\x25\xd7\xcb", + .okm_size = 42, + }, { + .test = "hkdf test with long input", + .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", + .ikm_size = 80, + .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", + .salt_size = 80, + .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .info_size = 80, + .prk = "\x35\x67\x25\x42\x90\x7d\x4e\x14\x2c\x00\xe8\x44\x99\xe7\x4e\x1d" + "\xe0\x8b\xe8\x65\x35\xf9\x24\xe0\x22\x80\x4a\xd7\x75\xdd\xe2\x7e" + "\xc8\x6c\xd1\xe5\xb7\xd1\x78\xc7\x44\x89\xbd\xbe\xb3\x07\x12\xbe" + "\xb8\x2d\x4f\x97\x41\x6c\x5a\x94\xea\x81\xeb\xdf\x3e\x62\x9e\x4a", + .prk_size = 64, + .okm = "\xce\x6c\x97\x19\x28\x05\xb3\x46\xe6\x16\x1e\x82\x1e\xd1\x65\x67" + "\x3b\x84\xf4\x00\xa2\xb5\x14\xb2\xfe\x23\xd8\x4c\xd1\x89\xdd\xf1" + "\xb6\x95\xb4\x8c\xbd\x1c\x83\x88\x44\x11\x37\xb3\xce\x28\xf1\x6a" + "\xa6\x4b\xa3\x3b\xa4\x66\xb2\x4d\xf6\xcf\xcb\x02\x1e\xcf\xf2\x35" + "\xf6\xa2\x05\x6c\xe3\xaf\x1d\xe4\x4d\x57\x20\x97\xa8\x50\x5d\x9e" + "\x7a\x93", + .okm_size = 82, + }, { + .test = "hkdf test with zero salt and info", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = NULL, + .salt_size = 0, + .info = NULL, + .info_size = 0, + .prk = "\xfd\x20\x0c\x49\x87\xac\x49\x13\x13\xbd\x4a\x2a\x13\x28\x71\x21" + 
"\x24\x72\x39\xe1\x1c\x9e\xf8\x28\x02\x04\x4b\x66\xef\x35\x7e\x5b" + "\x19\x44\x98\xd0\x68\x26\x11\x38\x23\x48\x57\x2a\x7b\x16\x11\xde" + "\x54\x76\x40\x94\x28\x63\x20\x57\x8a\x86\x3f\x36\x56\x2b\x0d\xf6", + .prk_size = 64, + .okm = "\xf5\xfa\x02\xb1\x82\x98\xa7\x2a\x8c\x23\x89\x8a\x87\x03\x47\x2c" + "\x6e\xb1\x79\xdc\x20\x4c\x03\x42\x5c\x97\x0e\x3b\x16\x4b\xf9\x0f" + "\xff\x22\xd0\x48\x36\xd0\xe2\x34\x3b\xac", + .okm_size = 42, + }, { + .test = "hkdf test with short input", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 11, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x67\x40\x9c\x9c\xac\x28\xb5\x2e\xe9\xfa\xd9\x1c\x2f\xda\x99\x9f" + "\x7c\xa2\x2e\x34\x34\xf0\xae\x77\x28\x63\x83\x65\x68\xad\x6a\x7f" + "\x10\xcf\x11\x3b\xfd\xdd\x56\x01\x29\xa5\x94\xa8\xf5\x23\x85\xc2" + "\xd6\x61\xd7\x85\xd2\x9c\xe9\x3a\x11\x40\x0c\x92\x06\x83\x18\x1d", + .prk_size = 64, + .okm = "\x74\x13\xe8\x99\x7e\x02\x06\x10\xfb\xf6\x82\x3f\x2c\xe1\x4b\xff" + "\x01\x87\x5d\xb1\xca\x55\xf6\x8c\xfc\xf3\x95\x4d\xc8\xaf\xf5\x35" + "\x59\xbd\x5e\x30\x28\xb0\x80\xf7\xc0\x68", + .okm_size = 42, + }, { + .test = "unsalted hkdf test with zero info", + .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" + "\x0c\x0c\x0c\x0c\x0c\x0c", + .ikm_size = 22, + .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + .salt_size = 64, + .info = NULL, + .info_size = 0, + .prk = "\x53\x46\xb3\x76\xbf\x3a\xa9\xf8\x4f\x8f\x6e\xd5\xb1\xc4\xf4\x89" + "\x17\x2e\x24\x4d\xac\x30\x3d\x12\xf6\x8e\xcc\x76\x6e\xa6\x00\xaa" + "\x88\x49\x5e\x7f\xb6\x05\x80\x31\x22\xfa\x13\x69\x24\xa8\x40\xb1" + "\xf0\x71\x9d\x2d\x5f\x68\xe2\x9b\x24\x22\x99\xd7\x58\xed\x68\x0c", + .prk_size = 64, + .okm = "\x14\x07\xd4\x60\x13\xd9\x8b\xc6\xde\xce\xfc\xfe\xe5\x5f\x0f\x90" + "\xb0\xc7\xf6\x3d\x68\xeb\x1a\x80\xea\xf0\x7e\x95\x3c\xfc\x0a\x3a" + "\x52\x40\xa1\x55\xd6\xe4\xda\xa9\x65\xbb", + .okm_size = 42, + } +}; + +static int hkdf_test(const char *shash, const struct hkdf_testvec *tv) +{ struct crypto_shash *tfm = NULL; + u8 *prk = NULL, *okm = NULL; + unsigned int prk_size; + const char *driver; + int err; + + tfm = crypto_alloc_shash(shash, 0, 0); + if (IS_ERR(tfm)) { + pr_err("%s(%s): failed to allocate transform: %ld\n", + tv->test, shash, PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + driver = crypto_shash_driver_name(tfm); + + prk_size = crypto_shash_digestsize(tfm); + prk = kzalloc(prk_size, GFP_KERNEL); + if (!prk) { + err = -ENOMEM; + goto out_free; + } + + if (tv->prk_size != prk_size) { + pr_err("%s(%s): prk size mismatch (vec %u, digest %u\n", + tv->test, driver, tv->prk_size, prk_size); + err = -EINVAL; + goto out_free; + } + + err = hkdf_extract(tfm, tv->ikm, tv->ikm_size, + tv->salt, tv->salt_size, prk); + if (err) { + pr_err("%s(%s): hkdf_extract failed with %d\n", + tv->test, driver, err); + goto out_free; + } + + if (memcmp(prk, tv->prk, tv->prk_size)) { + pr_err("%s(%s): hkdf_extract prk mismatch\n", + tv->test, driver); + print_hex_dump(KERN_ERR, "prk: ", DUMP_PREFIX_NONE, + 16, 1, prk, tv->prk_size, false); + err = -EINVAL; + goto out_free; + } + + okm = kzalloc(tv->okm_size, GFP_KERNEL); + if (!okm) { + err = -ENOMEM; + goto out_free; + } + + err = 
crypto_shash_setkey(tfm, tv->prk, tv->prk_size); + if (err) { + pr_err("%s(%s): failed to set prk, error %d\n", + tv->test, driver, err); + goto out_free; + } + + err = hkdf_expand(tfm, tv->info, tv->info_size, + okm, tv->okm_size); + if (err) { + pr_err("%s(%s): hkdf_expand() failed with %d\n", + tv->test, driver, err); + } else if (memcmp(okm, tv->okm, tv->okm_size)) { + pr_err("%s(%s): hkdf_expand() okm mismatch\n", + tv->test, driver); + print_hex_dump(KERN_ERR, "okm: ", DUMP_PREFIX_NONE, + 16, 1, okm, tv->okm_size, false); + err = -EINVAL; + } +out_free: + kfree(okm); + kfree(prk); + crypto_free_shash(tfm); + return err; +} + +static int __init crypto_hkdf_module_init(void) +{ + int ret = 0, i; + + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return 0; + + for (i = 0; i < ARRAY_SIZE(hkdf_sha256_tv); i++) { + ret = hkdf_test("hmac(sha256)", &hkdf_sha256_tv[i]); + if (ret) + return ret; + } + for (i = 0; i < ARRAY_SIZE(hkdf_sha384_tv); i++) { + ret = hkdf_test("hmac(sha384)", &hkdf_sha384_tv[i]); + if (ret) + return ret; + } + for (i = 0; i < ARRAY_SIZE(hkdf_sha512_tv); i++) { + ret = hkdf_test("hmac(sha512)", &hkdf_sha512_tv[i]); + if (ret) + return ret; + } + return 0; +} + +static void __exit crypto_hkdf_module_exit(void) {} + +module_init(crypto_hkdf_module_init); +module_exit(crypto_hkdf_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("HMAC-based Key Derivation Function (HKDF)"); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c05fe27a96b6..674527d770dc 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -45,8 +45,6 @@ enum { Lo_deleting, }; -struct loop_func_table; - struct loop_device { int lo_number; loff_t lo_offset; @@ -54,7 +52,8 @@ struct loop_device { int lo_flags; char lo_file_name[LO_NAME_SIZE]; - struct file * lo_backing_file; + struct file *lo_backing_file; + unsigned int lo_min_dio_size; struct block_device *lo_device; gfp_t old_gfp_mask; @@ -169,29 +168,14 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) * of backing device, and the logical block size of loop is bigger than that of * the backing device. 
*/ -static bool lo_bdev_can_use_dio(struct loop_device *lo, - struct block_device *backing_bdev) -{ - unsigned int sb_bsize = bdev_logical_block_size(backing_bdev); - - if (queue_logical_block_size(lo->lo_queue) < sb_bsize) - return false; - if (lo->lo_offset & (sb_bsize - 1)) - return false; - return true; -} - static bool lo_can_use_dio(struct loop_device *lo) { - struct inode *inode = lo->lo_backing_file->f_mapping->host; - if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) return false; - - if (S_ISBLK(inode->i_mode)) - return lo_bdev_can_use_dio(lo, I_BDEV(inode)); - if (inode->i_sb->s_bdev) - return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev); + if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) + return false; + if (lo->lo_offset & (lo->lo_min_dio_size - 1)) + return false; return true; } @@ -205,20 +189,12 @@ static bool lo_can_use_dio(struct loop_device *lo) */ static inline void loop_update_dio(struct loop_device *lo) { - bool dio_in_use = lo->lo_flags & LO_FLAGS_DIRECT_IO; - lockdep_assert_held(&lo->lo_mutex); WARN_ON_ONCE(lo->lo_state == Lo_bound && lo->lo_queue->mq_freeze_depth == 0); - if (lo->lo_backing_file->f_flags & O_DIRECT) - lo->lo_flags |= LO_FLAGS_DIRECT_IO; if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - - /* flush dirty pages before starting to issue direct I/O */ - if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !dio_in_use) - vfs_fsync(lo->lo_backing_file, 0); } /** @@ -541,6 +517,28 @@ static void loop_reread_partitions(struct loop_device *lo) __func__, lo->lo_number, lo->lo_file_name, rc); } +static unsigned int loop_query_min_dio_size(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; + struct kstat st; + + /* + * Use the minimal dio alignment of the file system if provided. + */ + if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN)) + return st.dio_offset_align; + + /* + * In a perfect world this wouldn't be needed, but as of Linux 6.13 only + * a handful of file systems support the STATX_DIOALIGN flag. + */ + if (sb_bdev) + return bdev_logical_block_size(sb_bdev); + return SECTOR_SIZE; +} + static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; @@ -573,6 +571,17 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) return 0; } +static void loop_assign_backing_file(struct loop_device *lo, struct file *file) +{ + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + if (lo->lo_backing_file->f_flags & O_DIRECT) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + lo->lo_min_dio_size = loop_query_min_dio_size(lo); +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -622,14 +631,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) goto out_err; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); + /* and ... 
switch */ disk_force_media_change(lo->lo_disk); memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); - mapping_set_gfp_mask(file->f_mapping, - lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; @@ -971,12 +984,11 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } -static unsigned int loop_default_blocksize(struct loop_device *lo, - struct block_device *backing_bdev) +static unsigned int loop_default_blocksize(struct loop_device *lo) { - /* In case of direct I/O, match underlying block size */ - if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev) - return bdev_logical_block_size(backing_bdev); + /* In case of direct I/O, match underlying minimum I/O size */ + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + return lo->lo_min_dio_size; return SECTOR_SIZE; } @@ -994,7 +1006,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, backing_bdev = inode->i_sb->s_bdev; if (!bsize) - bsize = loop_default_blocksize(lo, backing_bdev); + bsize = loop_default_blocksize(lo); loop_get_discard_config(lo, &granularity, &max_discard_sectors); @@ -1019,7 +1031,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, const struct loop_config *config) { struct file *file = fget(config->fd); - struct address_space *mapping; struct queue_limits lim; int error; loff_t size; @@ -1055,8 +1066,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (error) goto out_unlock; - mapping = file->f_mapping; - if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; @@ -1088,9 +1097,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->lo_device = bdev; - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); lim = queue_limits_start_update(lo->lo_queue); loop_update_limits(lo, &lim, config->block_size); @@ -1099,6 +1106,13 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (error) goto out_unlock; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. 
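loop_query_min_dio_size() above prefers the direct-I/O alignment reported by the backing file system and only falls back to the backing block device's logical block size (or 512 bytes) when STATX_DIOALIGN is not provided. A rough user-space analogue of that probe, assuming glibc 2.28+ for the statx() wrapper and kernel headers new enough to define STATX_DIOALIGN, could look like:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

/*
 * Return the minimum offset alignment for direct I/O on 'path', falling back
 * to 512 bytes when the file system does not report one.
 */
static unsigned int min_dio_size(const char *path)
{
        struct statx stx;

        if (statx(AT_FDCWD, path, 0, STATX_DIOALIGN, &stx) == 0 &&
            (stx.stx_mask & STATX_DIOALIGN) && stx.stx_dio_offset_align)
                return stx.stx_dio_offset_align;

        return 512;    /* SECTOR_SIZE fallback, as in the loop code above */
}

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        printf("min dio size: %u bytes\n", min_dio_size(argv[1]));
        return 0;
}
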
+ */ + vfs_fsync(file, 0); + loop_update_dio(lo); loop_sysfs_init(lo); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 95361099a2dc..0d619df03fa9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2056,7 +2056,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, unsigned int nents; /* Map the scatter list for DMA access */ - nents = blk_rq_map_sg(hctx->queue, rq, command->sg); + nents = blk_rq_map_sg(rq, command->sg); nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); prefetch(&port->flags); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index da1ecbf988b8..3bb9cee0a9b5 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -473,6 +473,8 @@ NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); NULLB_DEVICE_ATTR(rotational, bool, NULL); +NULLB_DEVICE_ATTR(badblocks_once, bool, NULL); +NULLB_DEVICE_ATTR(badblocks_partial_io, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -559,14 +561,14 @@ static ssize_t nullb_device_badblocks_store(struct config_item *item, goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) + if (buf[0] == '+') { + if (badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1)) + ret = count; + } else if (badblocks_clear(&t_dev->badblocks, start, + end - start + 1)) { ret = count; + } out: kfree(orig); return ret; @@ -592,41 +594,43 @@ static ssize_t nullb_device_zone_offline_store(struct config_item *item, CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { - &nullb_device_attr_size, + &nullb_device_attr_badblocks, + &nullb_device_attr_badblocks_once, + &nullb_device_attr_badblocks_partial_io, + &nullb_device_attr_blocking, + &nullb_device_attr_blocksize, + &nullb_device_attr_cache_size, &nullb_device_attr_completion_nsec, - &nullb_device_attr_submit_queues, - &nullb_device_attr_poll_queues, + &nullb_device_attr_discard, + &nullb_device_attr_fua, &nullb_device_attr_home_node, - &nullb_device_attr_queue_mode, - &nullb_device_attr_blocksize, - &nullb_device_attr_max_sectors, - &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, - &nullb_device_attr_blocking, - &nullb_device_attr_use_per_node_hctx, - &nullb_device_attr_power, - &nullb_device_attr_memory_backed, - &nullb_device_attr_discard, + &nullb_device_attr_irqmode, + &nullb_device_attr_max_sectors, &nullb_device_attr_mbps, - &nullb_device_attr_cache_size, - &nullb_device_attr_badblocks, - &nullb_device_attr_zoned, - &nullb_device_attr_zone_size, + &nullb_device_attr_memory_backed, + &nullb_device_attr_no_sched, + &nullb_device_attr_poll_queues, + &nullb_device_attr_power, + &nullb_device_attr_queue_mode, + &nullb_device_attr_rotational, + &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_shared_tags, + &nullb_device_attr_size, + &nullb_device_attr_submit_queues, + &nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_virt_boundary, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_capacity, - &nullb_device_attr_zone_nr_conv, - &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_full, 
&nullb_device_attr_zone_max_active, - &nullb_device_attr_zone_append_max_sectors, - &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_offline, - &nullb_device_attr_zone_full, - &nullb_device_attr_virt_boundary, - &nullb_device_attr_no_sched, - &nullb_device_attr_shared_tags, - &nullb_device_attr_shared_tag_bitmap, - &nullb_device_attr_fua, - &nullb_device_attr_rotational, + &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_size, + &nullb_device_attr_zoned, NULL, }; @@ -704,16 +708,28 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size,fua," - "completion_nsec,discard,home_node,hw_queue_depth," - "irqmode,max_sectors,mbps,memory_backed,no_sched," - "poll_queues,power,queue_mode,shared_tag_bitmap," - "shared_tags,size,submit_queues,use_per_node_hctx," - "virt_boundary,zoned,zone_capacity,zone_max_active," - "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors,zone_full," - "rotational\n"); + + struct configfs_attribute **entry; + char delimiter = ','; + size_t left = PAGE_SIZE; + size_t written = 0; + int ret; + + for (entry = &nullb_device_attrs[0]; *entry && left > 0; entry++) { + if (!*(entry + 1)) + delimiter = '\n'; + ret = snprintf(page + written, left, "%s%c", (*entry)->ca_name, + delimiter); + if (ret >= left) { + WARN_ONCE(1, "Too many null_blk features to print\n"); + memzero_explicit(page, PAGE_SIZE); + return -ENOBUFS; + } + left -= ret; + written += ret; + } + + return written; } CONFIGFS_ATTR_RO(memb_group_, features); @@ -1249,25 +1265,37 @@ static int null_transfer(struct nullb *nullb, struct page *page, return err; } -static blk_status_t null_handle_rq(struct nullb_cmd *cmd) +/* + * Transfer data for the given request. The transfer size is capped with the + * nr_sectors argument. + */ +static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, + sector_t nr_sectors) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); + unsigned int max_bytes = nr_sectors << SECTOR_SHIFT; + unsigned int transferred_bytes = 0; struct req_iterator iter; struct bio_vec bvec; spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; + if (transferred_bytes + len > max_bytes) + len = max_bytes - transferred_bytes; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); if (err) break; sector += len >> SECTOR_SHIFT; + transferred_bytes += len; + if (transferred_bytes >= max_bytes) + break; } spin_unlock_irq(&nullb->lock); @@ -1295,31 +1323,51 @@ static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) return sts; } -static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, - sector_t sector, - sector_t nr_sectors) +/* + * Check if the command should fail for the badblocks. If so, return + * BLK_STS_IOERR and return number of partial I/O sectors to be written or read, + * which may be less than the requested number of sectors. + * + * @cmd: The command to handle. + * @sector: The start sector for I/O. + * @nr_sectors: Specifies number of sectors to write or read, and returns the + * number of sectors to be written or read. 
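With badblocks_partial_io enabled, null_handle_badblocks() no longer fails the whole request: it aligns the first bad sector down to a block boundary and lets the sectors in front of it complete, reporting the error only for the remainder. The clamping is plain arithmetic; a small stand-alone sketch of that behaviour (block size and sector values are example numbers, and the overlap check that badblocks_check() normally performs is folded in here):

#include <stdio.h>

typedef unsigned long long sector_t;

/*
 * Given an I/O starting at 'sector' for '*nr_sectors' sectors and the first
 * bad sector, reduce *nr_sectors to the part in front of the block-aligned
 * bad range.  Returns 1 if the request must still be failed, 0 if the bad
 * range does not touch it at all.
 */
static int clamp_to_badblock(sector_t sector, unsigned int *nr_sectors,
                             sector_t first_bad, unsigned int block_sectors)
{
        if (first_bad >= sector + *nr_sectors)
                return 0;                          /* no overlap, I/O is fine      */

        first_bad -= first_bad % block_sectors;    /* ALIGN_DOWN to a full block   */
        *nr_sectors = first_bad > sector ? first_bad - sector : 0;
        return 1;                                  /* caller reports BLK_STS_IOERR */
}

int main(void)
{
        unsigned int nr = 32;                      /* 16 KiB I/O in 512-byte sectors */
        int bad = clamp_to_badblock(100, &nr, 117, 8 /* 4 KiB blocks */);

        printf("failed=%d, sectors completed before error: %u\n", bad, nr);
        return 0;
}
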
+ */ +blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, + unsigned int *nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; - sector_t first_bad; - int bad_sectors; + struct nullb_device *dev = cmd->nq->dev; + unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; + sector_t first_bad, bad_sectors; + unsigned int partial_io_sectors = 0; - if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) - return BLK_STS_IOERR; + if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_OK; - return BLK_STS_OK; + if (cmd->nq->dev->badblocks_once) + badblocks_clear(bb, first_bad, bad_sectors); + + if (cmd->nq->dev->badblocks_partial_io) { + if (!IS_ALIGNED(first_bad, block_sectors)) + first_bad = ALIGN_DOWN(first_bad, block_sectors); + if (sector < first_bad) + partial_io_sectors = first_bad - sector; + } + *nr_sectors = partial_io_sectors; + + return BLK_STS_IOERR; } -static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_op op, - sector_t sector, - sector_t nr_sectors) +blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - return null_handle_rq(cmd); + return null_handle_data_transfer(cmd, nr_sectors); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) @@ -1366,18 +1414,19 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; + blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; - if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (dev->badblocks.shift != -1) + badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); + + if (dev->memory_backed && nr_sectors) { + ret = null_handle_memory_backed(cmd, op, sector, nr_sectors); if (ret != BLK_STS_OK) return ret; } - if (dev->memory_backed) - return null_handle_memory_backed(cmd, op, sector, nr_sectors); - - return BLK_STS_OK; + return badblocks_ret; } static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 6f9fe6171087..7bb6128dbaaf 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -63,6 +63,8 @@ struct nullb_device { unsigned long flags; /* device flags */ unsigned int curr_cache; struct badblocks badblocks; + bool badblocks_once; + bool badblocks_partial_io; unsigned int nr_zones; unsigned int nr_zones_imp_open; @@ -131,6 +133,10 @@ blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors); blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors); +blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, + unsigned int *nr_sectors); +blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors); #ifdef CONFIG_BLK_DEV_ZONED int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 0d5f9bf95229..4e5728f45989 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -353,6 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t 
sector, struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); struct nullb_zone *zone = &dev->zones[zno]; + blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; trace_nullb_zone_op(cmd, zno, zone->cond); @@ -412,9 +413,20 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, zone->cond = BLK_ZONE_COND_IMP_OPEN; } - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (ret != BLK_STS_OK) - goto unlock_zone; + if (dev->badblocks.shift != -1) { + badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); + if (badblocks_ret != BLK_STS_OK && !nr_sectors) { + ret = badblocks_ret; + goto unlock_zone; + } + } + + if (dev->memory_backed) { + ret = null_handle_memory_backed(cmd, REQ_OP_WRITE, sector, + nr_sectors); + if (ret != BLK_STS_OK) + goto unlock_zone; + } zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { @@ -429,7 +441,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, zone->cond = BLK_ZONE_COND_FULL; } - ret = BLK_STS_OK; + ret = badblocks_ret; unlock_zone: null_unlock_zone(dev, zone); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 82467ecde7ec..15627417f12e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1010,7 +1010,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, * See queue limits. */ if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) - sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl); + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); if (sg_cnt == 0) sg_mark_end(&iu->sgt.sgl[0]); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 282f81616a78..2b33fb5b949b 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -485,7 +485,7 @@ static int __send_request(struct request *req) } sg_init_table(sg, port->ring_cookies); - nsg = blk_rq_map_sg(req->q, req, sg); + nsg = blk_rq_map_sg(req, sg); len = 0; for (i = 0; i < nsg; i++) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6d7b26ab434f..c060da409ed8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -73,11 +73,10 @@ /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ - UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ + UBLK_PARAM_TYPE_DMA_ALIGN) struct ublk_rq_data { - struct llist_node node; - struct kref ref; }; @@ -144,8 +143,6 @@ struct ublk_queue { struct task_struct *ubq_daemon; char *io_cmd_buf; - struct llist_head io_cmds; - unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; bool force_abort; @@ -494,15 +491,17 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); + +#define UBLK_MAX_UBLKS UBLK_MINORS + /* - * Max ublk devices allowed to add + * Max unprivileged ublk devices allowed to add * * It can be extended to one per-user limit in future or even controlled * by cgroup. 
*/ -#define UBLK_MAX_UBLKS UBLK_MINORS -static unsigned int ublks_max = 64; -static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ +static unsigned int unprivileged_ublks_max = 64; +static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */ static struct miscdevice ublk_misc; @@ -573,6 +572,16 @@ static int ublk_validate_params(const struct ublk_device *ub) else if (ublk_dev_is_zoned(ub)) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { + const struct ublk_param_dma_align *p = &ub->params.dma; + + if (p->alignment >= PAGE_SIZE) + return -EINVAL; + + if (!is_power_of_2(p->alignment + 1)) + return -EINVAL; + } + return 0; } @@ -1100,7 +1109,7 @@ static void ublk_complete_rq(struct kref *ref) } /* - * Since __ublk_rq_task_work always fails requests immediately during + * Since ublk_rq_task_work_cb always fails requests immediately during * exiting, __ublk_fail_req() is only called from abort context during * exiting. So lock is unnecessary. * @@ -1146,11 +1155,14 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } -static inline void __ublk_rq_task_work(struct request *req, - unsigned issue_flags) +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) { - struct ublk_queue *ubq = req->mq_hctx->driver_data; - int tag = req->tag; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + int tag = pdu->tag; + struct request *req = blk_mq_tag_to_rq( + ubq->dev->tag_set.tags[ubq->q_id], tag); struct ublk_io *io = &ubq->ios[tag]; unsigned int mapped_bytes; @@ -1225,34 +1237,11 @@ static inline void __ublk_rq_task_work(struct request *req, ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } -static inline void ublk_forward_io_cmds(struct ublk_queue *ubq, - unsigned issue_flags) -{ - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data, *tmp; - - io_cmds = llist_reverse_order(io_cmds); - llist_for_each_entry_safe(data, tmp, io_cmds, node) - __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags); -} - -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) -{ - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - struct ublk_queue *ubq = pdu->ubq; - - ublk_forward_io_cmds(ubq, issue_flags); -} - static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + struct ublk_io *io = &ubq->ios[rq->tag]; - if (llist_add(&data->node, &ubq->io_cmds)) { - struct ublk_io *io = &ubq->ios[rq->tag]; - - io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); - } + io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); } static enum blk_eh_timer_return ublk_timeout(struct request *rq) @@ -1445,7 +1434,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) struct request *rq; /* - * Either we fail the request or ublk_rq_task_work_fn + * Either we fail the request or ublk_rq_task_work_cb * will do it */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); @@ -1911,10 +1900,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; out: - io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", __func__, cmd_op, tag, ret, io->flags); - return -EIOCBQUEUED; + return ret; } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, @@ -1970,7 +1958,10 @@ static 
inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, unsigned int issue_flags) { - ublk_ch_uring_cmd_local(cmd, issue_flags); + int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(cmd, ret, 0, issue_flags); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -2235,7 +2226,8 @@ static int ublk_add_chdev(struct ublk_device *ub) if (ret) goto fail; - ublks_added++; + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) + unprivileged_ublks_added++; return 0; fail: put_device(dev); @@ -2264,11 +2256,16 @@ static int ublk_add_tag_set(struct ublk_device *ub) static void ublk_remove(struct ublk_device *ub) { + bool unprivileged; + ublk_stop_dev(ub); cancel_work_sync(&ub->nosrv_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); + unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; ublk_put_device(ub); - ublks_added--; + + if (unprivileged) + unprivileged_ublks_added--; } static struct ublk_device *ublk_get_device_from_id(int idx) @@ -2343,6 +2340,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) lim.features |= BLK_FEAT_ROTATIONAL; + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) + lim.dma_alignment = ub->params.dma.alignment; + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -2530,7 +2530,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) return ret; ret = -EACCES; - if (ublks_added >= ublks_max) + if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) && + unprivileged_ublks_added >= unprivileged_ublks_max) goto out_unlock; ret = -ENOMEM; @@ -3101,10 +3102,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ub) ublk_put_device(ub); out: - io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); - return -EIOCBQUEUED; + return ret; } static const struct file_operations ublk_ctl_fops = { @@ -3168,23 +3168,26 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); -static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp) +static int ublk_set_max_unprivileged_ublks(const char *buf, + const struct kernel_param *kp) { return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); } -static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp) +static int ublk_get_max_unprivileged_ublks(char *buf, + const struct kernel_param *kp) { - return sysfs_emit(buf, "%u\n", ublks_max); + return sysfs_emit(buf, "%u\n", unprivileged_ublks_max); } -static const struct kernel_param_ops ublk_max_ublks_ops = { - .set = ublk_set_max_ublks, - .get = ublk_get_max_ublks, +static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = { + .set = ublk_set_max_unprivileged_ublks, + .get = ublk_get_max_unprivileged_ublks, }; -module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644); -MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); +module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops, + &unprivileged_ublks_max, 0644); +MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)"); MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); MODULE_DESCRIPTION("Userspace block device"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 91cde76a4b3e..7cffea01d868 
100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -226,7 +226,7 @@ static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, if (unlikely(err)) return -ENOMEM; - return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); + return blk_rq_map_sg(req, vbr->sg_table.sgl); } static void virtblk_cleanup_cmd(struct request *req) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index edcd08a9dcef..5babe575c288 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -751,7 +751,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri id = blkif_ring_get_request(rinfo, req, &final_ring_req); ring_req = &rinfo->shadow[id].req; - num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); + num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c45464b6576a..c8c1a00e7d80 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4811,23 +4811,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); if (r) { ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recalc_bios, 1); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0ef5203387b2..453803f1edf5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1081,15 +1081,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->io_bs, pool_size)) - goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->bs, pool_size)) - goto out_free_pools; t->mempools = pools; return 0; @@ -1250,6 +1244,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) profile->max_dun_bytes_supported = UINT_MAX; memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported)); + profile->key_types_supported = ~0; for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 23c09d22fcdb..44ec9b17cfd3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,8 +29,10 @@ #include <linux/buffer_head.h> #include <linux/seq_file.h> #include <trace/events/block.h> + #include "md.h" #include "md-bitmap.h" +#include "md-cluster.h" #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order @@ -426,8 +428,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; - unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) << - PAGE_SHIFT; + 
unsigned long num_pages = bitmap->storage.file_pages; + unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; @@ -436,7 +438,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; /* we compare length (page numbers), not page offset. */ - if ((pg_index - store->sb_index) == store->file_pages - 1) { + if ((pg_index - store->sb_index) == num_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -942,7 +944,7 @@ out: bmname(bitmap), err); goto out_no_sb; } - bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev); goto re_read; } @@ -2021,7 +2023,7 @@ static void md_bitmap_free(void *data) sysfs_put(bitmap->sysfs_can_clear); if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && - bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev)) md_cluster_stop(bitmap->mddev); /* Shouldn't be needed - but just in case.... */ @@ -2229,7 +2231,7 @@ static int bitmap_load(struct mddev *mddev) mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) - md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); /* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 6595f89becdb..94221d964d4f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1166,7 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz struct dlm_lock_resource *bm_lockres; char str[64]; - if (i == md_cluster_ops->slot_number(mddev)) + if (i == slot_number(mddev)) continue; bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); @@ -1216,7 +1216,7 @@ out: */ static int cluster_check_sync_size(struct mddev *mddev) { - int current_slot = md_cluster_ops->slot_number(mddev); + int current_slot = slot_number(mddev); int node_num = mddev->bitmap_info.nodes; struct dlm_lock_resource *bm_lockres; struct md_bitmap_stats stats; @@ -1612,7 +1612,14 @@ out: return err; } -static const struct md_cluster_operations cluster_ops = { +static struct md_cluster_operations cluster_ops = { + .head = { + .type = MD_CLUSTER, + .id = ID_CLUSTER, + .name = "cluster", + .owner = THIS_MODULE, + }, + .join = join, .leave = leave, .slot_number = slot_number, @@ -1642,13 +1649,12 @@ static int __init cluster_init(void) { pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); - register_md_cluster_operations(&cluster_ops, THIS_MODULE); - return 0; + return register_md_submodule(&cluster_ops.head); } static void cluster_exit(void) { - unregister_md_cluster_operations(); + unregister_md_submodule(&cluster_ops.head); } module_init(cluster_init); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 470bf18ffde5..8fb06d853173 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -10,6 +10,8 @@ struct mddev; struct md_rdev; struct md_cluster_operations { + struct md_submodule_head head; + int (*join)(struct mddev *mddev, int nodes); int 
(*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); @@ -35,4 +37,8 @@ struct md_cluster_operations { void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); }; +extern int md_setup_cluster(struct mddev *mddev, int nodes); +extern void md_cluster_stop(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); + #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 369aed044b40..5d9b08115375 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -5,7 +5,6 @@ */ #include <linux/blkdev.h> -#include <linux/raid/md_u.h> #include <linux/seq_file.h> #include <linux/module.h> #include <linux/slab.h> @@ -320,9 +319,13 @@ static void linear_quiesce(struct mddev *mddev, int state) } static struct md_personality linear_personality = { - .name = "linear", - .level = LEVEL_LINEAR, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_LINEAR, + .name = "linear", + .owner = THIS_MODULE, + }, + .make_request = linear_make_request, .run = linear_run, .free = linear_free, @@ -335,12 +338,12 @@ static struct md_personality linear_personality = { static int __init linear_init(void) { - return register_md_personality(&linear_personality); + return register_md_submodule(&linear_personality.head); } static void linear_exit(void) { - unregister_md_personality(&linear_personality); + unregister_md_submodule(&linear_personality.head); } module_init(linear_init); diff --git a/drivers/md/md.c b/drivers/md/md.c index 30b3dbbce2d2..438e71e45c16 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = { [ACTION_IDLE] = "idle", }; -/* pers_list is a list of registered personalities protected by pers_lock. 
*/ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; -const struct md_cluster_operations *md_cluster_ops; -EXPORT_SYMBOL(md_cluster_ops); -static struct module *md_cluster_mod; - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -629,6 +623,12 @@ static void __mddev_put(struct mddev *mddev) queue_work(md_misc_wq, &mddev->del_work); } +static void mddev_put_locked(struct mddev *mddev) +{ + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) @@ -888,16 +888,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ @@ -1180,7 +1204,7 @@ int md_check_no_bitmap(struct mddev *mddev) if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); @@ -1748,7 +1772,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -2359,19 +2383,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. 
- */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); @@ -2639,11 +2650,11 @@ repeat: force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; - ret = md_cluster_ops->metadata_update_start(mddev); + ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); + mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); @@ -2783,7 +2794,7 @@ rewrite: /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); + mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), @@ -2942,7 +2953,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else { err = 0; if (mddev_is_clustered(mddev)) - err = md_cluster_ops->remove_disk(mddev, rdev); + err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); @@ -3052,7 +3063,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || - (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } @@ -3860,7 +3871,7 @@ level_show(struct mddev *mddev, char *page) spin_lock(&mddev->lock); p = mddev->pers; if (p) - ret = sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) @@ -3917,7 +3928,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); goto out_unlock; } @@ -3931,24 +3942,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - pr_warn("md: personality %s not loaded\n", clevel); + pers = get_pers(level, clevel); + if (!pers) { rv = -EINVAL; goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! 
*/ - module_put(pers->owner); + put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; @@ -3969,7 +3976,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); @@ -3984,7 +3991,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4026,7 +4033,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } - module_put(oldpers->owner); + put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) @@ -5584,7 +5591,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { - if (mddev->pers == NULL || (mddev->pers->level != 1)) + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); @@ -5610,7 +5617,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; - if (mddev->pers == NULL || (mddev->pers->level != 1)) { + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; @@ -6096,30 +6103,21 @@ int md_run(struct mddev *mddev) goto exit_sync_set; } - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - pr_warn("md: personality for level %d is not loaded!\n", - mddev->level); - else - pr_warn("md: personality for level %s is not loaded!\n", - mddev->clevel); + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) { err = -EINVAL; goto abort; } - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... 
*/ - module_put(pers->owner); + put_pers(pers); err = -EINVAL; goto abort; } @@ -6246,7 +6244,7 @@ bitmap_abort: if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - module_put(pers->owner); + put_pers(pers); mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); @@ -6467,7 +6465,7 @@ static void __md_stop(struct mddev *mddev) mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; - module_put(pers->owner); + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); bioset_exit(&mddev->bio_set); @@ -6983,7 +6981,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk(mddev, rdev); + err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; @@ -7000,14 +6998,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { - err = md_cluster_ops->new_disk_ack(mddev, - err == 0); + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) - md_cluster_ops->add_new_disk_cancel(mddev); + mddev->cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } @@ -7087,10 +7085,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) { - if (md_cluster_ops->remove_disk(mddev, rdev)) - goto busy; - } + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7393,7 +7390,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -7441,6 +7438,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} + /* * update_array_info is used to change the configuration of an * on-line array. 
@@ -7549,16 +7568,15 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ - if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; - md_cluster_ops->unlock_all_bitmaps(mddev); + mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev->bitmap_ops->destroy(mddev); @@ -7842,7 +7860,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) - md_cluster_ops->new_disk_ack(mddev, false); + mddev->cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; @@ -8124,7 +8142,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -8162,14 +8181,17 @@ static void status_unused(struct seq_file *seq) static void status_personalities(struct seq_file *seq) { - struct md_personality *pers; + struct md_submodule_head *head; + unsigned long i; seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - spin_unlock(&pers_lock); + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + seq_puts(seq, "\n"); } @@ -8392,7 +8414,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); } else { seq_printf(seq, "inactive"); } @@ -8461,9 +8483,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) status_unused(seq); - if (atomic_dec_and_test(&mddev->active)) - __mddev_put(mddev); - + mddev_put_locked(mddev); return 0; } @@ -8514,67 +8534,34 @@ static const struct proc_ops mdstat_proc_ops = { .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality registered for level %d\n", - p->name, p->level); - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(register_md_personality); - -int unregister_md_personality(struct md_personality *p) +int register_md_submodule(struct md_submodule_head *msh) { - pr_debug("md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } -EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL_GPL(register_md_submodule); -int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module) +void unregister_md_submodule(struct md_submodule_head *msh) { - int ret = 0; - spin_lock(&pers_lock); - if (md_cluster_ops != NULL) - ret = -EALREADY; - else { - 
md_cluster_ops = ops; - md_cluster_mod = module; - } - spin_unlock(&pers_lock); - return ret; + xa_erase(&md_submodule, msh->id); } -EXPORT_SYMBOL(register_md_cluster_operations); - -int unregister_md_cluster_operations(void) -{ - spin_lock(&pers_lock); - md_cluster_ops = NULL; - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(unregister_md_cluster_operations); +EXPORT_SYMBOL_GPL(unregister_md_submodule); int md_setup_cluster(struct mddev *mddev, int nodes) { - int ret; - if (!md_cluster_ops) + int ret = get_cluster_ops(mddev); + + if (ret) { request_module("md-cluster"); - spin_lock(&pers_lock); + ret = get_cluster_ops(mddev); + } + /* ensure module won't be unloaded */ - if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + if (ret) { pr_warn("can't find md-cluster module or get its reference.\n"); - spin_unlock(&pers_lock); - return -ENOENT; + return ret; } - spin_unlock(&pers_lock); - ret = md_cluster_ops->join(mddev, nodes); + ret = mddev->cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; @@ -8582,10 +8569,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) void md_cluster_stop(struct mddev *mddev) { - if (!md_cluster_ops) - return; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); } static int is_mddev_idle(struct mddev *mddev, int init) @@ -8978,7 +8962,7 @@ void md_do_sync(struct md_thread *thread) } if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); + ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip; @@ -9005,7 +8989,7 @@ void md_do_sync(struct md_thread *thread) * */ if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start_notify(mddev); + mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -9460,6 +9444,13 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } + /* Check if resync is in progress. */ + if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + /* * Remove any failed drives, then add spares if possible. Spares are * also removed and re-added, to allow the personality to fail the @@ -9476,13 +9467,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } - /* Check if recovery is in progress. */ - if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - return true; - } - /* Delay to choose resync/check/repair in md_do_sync(). 
*/ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; @@ -9789,7 +9773,7 @@ void md_reap_sync_thread(struct mddev *mddev) * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) - md_cluster_ops->resync_finish(mddev); + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -9797,13 +9781,13 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* - * We call md_cluster_ops->update_size here because sync_size could + * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9841,12 +9825,11 @@ EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ -/* Returns 1 on success, 0 on failure */ -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { struct mddev *mddev = rdev->mddev; - int rv; /* * Recording new badblocks for faulty rdev will force unnecessary @@ -9856,50 +9839,51 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, * avoid it. 
*/ if (test_bit(Faulty, &rdev->flags)) - return 1; + return true; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else - return 0; + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + return false; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return true; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct mddev *mddev, *n; + struct mddev *mddev; int need_delay = 0; spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -9911,8 +9895,8 @@ static int md_notify_reboot(struct notifier_block *this, mddev_unlock(mddev); } need_delay = 1; - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); @@ -10029,7 +10013,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && - !md_cluster_ops->resync_status_get(mddev)) { + !mddev->cluster_ops->resync_status_get(mddev)) { /* * -1 to make raid1_add_disk() set conf->fullsync * to 1. This could avoid skipping sync when the @@ -10245,7 +10229,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev, *n; + struct mddev *mddev; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -10266,7 +10250,7 @@ static __exit void md_exit(void) remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -10278,8 +10262,8 @@ static __exit void md_exit(void) * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. 
*/ - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); diff --git a/drivers/md/md.h b/drivers/md/md.h index def808064ad8..1cf00a04bcdd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -18,11 +18,37 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/raid/md_u.h> #include <trace/events/block.h> -#include "md-cluster.h" #define MaxSector (~(sector_t)0) +enum md_submodule_type { + MD_PERSONALITY = 0, + MD_CLUSTER, + MD_BITMAP, /* TODO */ +}; + +enum md_submodule_id { + ID_LINEAR = LEVEL_LINEAR, + ID_RAID0 = 0, + ID_RAID1 = 1, + ID_RAID4 = 4, + ID_RAID5 = 5, + ID_RAID6 = 6, + ID_RAID10 = 10, + ID_CLUSTER, + ID_BITMAP, /* TODO */ + ID_LLBITMAP, /* TODO */ +}; + +struct md_submodule_head { + enum md_submodule_type type; + enum md_submodule_id id; + const char *name; + struct module *owner; +}; + /* * These flags should really be called "NO_RETRY" rather than * "FAILFAST" because they don't make any promise about time lapse, @@ -266,8 +292,8 @@ enum flag_bits { Nonrot, /* non-rotational device (SSD) */ }; -static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, @@ -284,16 +310,17 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } -extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new); -extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new); +extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); struct md_cluster_info; +struct md_cluster_operations; /** * enum mddev_flags - md device flags. @@ -576,6 +603,7 @@ struct mddev { mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + struct md_cluster_operations *cluster_ops; unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ @@ -699,10 +727,8 @@ static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) struct md_personality { - char *name; - int level; - struct list_head list; - struct module *owner; + struct md_submodule_head head; + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. 
tasks that @@ -843,13 +869,9 @@ static inline void safe_put_page(struct page *p) if (p) put_page(p); } -extern int register_md_personality(struct md_personality *p); -extern int unregister_md_personality(struct md_personality *p); -extern int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module); -extern int unregister_md_cluster_operations(void); -extern int md_setup_cluster(struct mddev *mddev, int nodes); -extern void md_cluster_stop(struct mddev *mddev); +int register_md_submodule(struct md_submodule_head *msh); +void unregister_md_submodule(struct md_submodule_head *msh); + extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, @@ -906,7 +928,6 @@ extern void md_idle_sync_thread(struct mddev *mddev); extern void md_frozen_sync_thread(struct mddev *mddev); extern void md_unfrozen_sync_thread(struct mddev *mddev); -extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); extern void mddev_destroy_serial_pool(struct mddev *mddev, @@ -928,7 +949,6 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) } } -extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 70bcc3cdf2cd..d8f639f4ae12 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -809,9 +809,13 @@ static void raid0_quiesce(struct mddev *mddev, int quiesce) static struct md_personality raid0_personality= { - .name = "raid0", - .level = 0, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID0, + .name = "raid0", + .owner = THIS_MODULE, + }, + .make_request = raid0_make_request, .run = raid0_run, .free = raid0_free, @@ -822,14 +826,14 @@ static struct md_personality raid0_personality= .error_handler = raid0_error, }; -static int __init raid0_init (void) +static int __init raid0_init(void) { - return register_md_personality (&raid0_personality); + return register_md_submodule(&raid0_personality.head); } -static void raid0_exit (void) +static void __exit raid0_exit(void) { - unregister_md_personality (&raid0_personality); + unregister_md_submodule(&raid0_personality.head); } module_init(raid0_init); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 4378d3250bd7..c7efd8aab675 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, sector_t this_sector, int *len) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; /* no bad block overlap */ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) @@ -287,8 +287,8 @@ static inline bool raid1_should_read_first(struct mddev *mddev, return true; if (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, READ, this_sector, - this_sector + len)) + mddev->cluster_ops->area_resyncing(mddev, READ, this_sector, + this_sector + len)) return true; return false; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 10ea3af40991..0efc03cea24e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -36,6 +36,7 @@ #include "md.h" #include "raid1.h" #include "md-bitmap.h" +#include "md-cluster.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ @@ -45,6 +46,7 @@ static 
void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); +static void raid1_free(struct mddev *mddev, void *priv); #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -1315,8 +1317,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct bio *read_bio; - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; int rdisk, error; bool r1bio_existed = !!r1_bio; @@ -1404,7 +1404,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &mirror->rdev->flags) && test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1467,7 +1466,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bool is_discard = (bio_op(bio) == REQ_OP_DISCARD); if (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, + mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) { DEFINE_WAIT(w); @@ -1478,7 +1477,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; @@ -1537,7 +1536,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, @@ -1653,8 +1652,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); mbio->bi_end_io = raid1_end_write_request; - mbio->bi_opf = bio_op(bio) | - (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC)); if (test_bit(FailFast, &rdev->flags) && !test_bit(WriteMostly, &rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -2486,7 +2483,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) } } -static int narrow_write_error(struct r1bio *r1_bio, int i) +static bool narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2507,10 +2504,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) sector_t sector; int sectors; int sect_to_write = r1_bio->sectors; - int ok = 1; + bool ok = true; if (rdev->badblocks.shift < 0) - return 0; + return false; block_sectors = roundup(1 << rdev->badblocks.shift, bdev_logical_block_size(rdev->bdev) >> 9); @@ -2886,7 +2883,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { /* may need to read from here */ sector_t first_bad = MaxSector; - int bad_sectors; + sector_t bad_sectors; if (is_badblock(rdev, sector_nr, good_sectors, &first_bad, &bad_sectors)) { @@ -3038,9 +3035,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, conf->cluster_sync_low = mddev->curr_resync_completed; conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; /* Send resync message */ - md_cluster_ops->resync_info_update(mddev, - conf->cluster_sync_low, - 
conf->cluster_sync_high); + mddev->cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); } /* For a user-requested sync, we read all readable devices and do a @@ -3256,8 +3253,11 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); - if (ret) + if (ret) { + if (!mddev->private) + raid1_free(mddev, conf); return ret; + } } mddev->degraded = 0; @@ -3271,6 +3271,8 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); + if (!mddev->private) + raid1_free(mddev, conf); return -EINVAL; } @@ -3491,9 +3493,13 @@ static void *raid1_takeover(struct mddev *mddev) static struct md_personality raid1_personality = { - .name = "raid1", - .level = 1, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID1, + .name = "raid1", + .owner = THIS_MODULE, + }, + .make_request = raid1_make_request, .run = raid1_run, .free = raid1_free, @@ -3510,18 +3516,18 @@ static struct md_personality raid1_personality = .takeover = raid1_takeover, }; -static int __init raid_init(void) +static int __init raid1_init(void) { - return register_md_personality(&raid1_personality); + return register_md_submodule(&raid1_personality.head); } -static void raid_exit(void) +static void __exit raid1_exit(void) { - unregister_md_personality(&raid1_personality); + unregister_md_submodule(&raid1_personality.head); } -module_init(raid_init); -module_exit(raid_exit); +module_init(raid1_init); +module_exit(raid1_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 15b9ae5bf84d..846c5f29486e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -24,6 +24,7 @@ #include "raid10.h" #include "raid0.h" #include "md-bitmap.h" +#include "md-cluster.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. 
@@ -747,7 +748,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, for (slot = 0; slot < conf->copies ; slot++) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; sector_t dev_sector; unsigned int pending; bool nonrot; @@ -1146,8 +1147,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, { struct r10conf *conf = mddev->private; struct bio *read_bio; - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; @@ -1228,7 +1227,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &rdev->flags) && test_bit(R10BIO_FailFast, &r10_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1247,10 +1245,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, struct bio *bio, bool replacement, int n_copy) { - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; - const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; - const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC; unsigned long flags; struct r10conf *conf = mddev->private; struct md_rdev *rdev; @@ -1269,7 +1263,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; - mbio->bi_opf = op | do_sync | do_fua | do_atomic; if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) && enough(conf, devnum)) @@ -1355,9 +1348,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int error; if ((mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) { + mddev->cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { DEFINE_WAIT(w); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { @@ -1367,7 +1360,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; schedule(); @@ -1438,7 +1431,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, dev_sector, max_sectors, @@ -1631,11 +1624,10 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return -EAGAIN; - if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { + if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { bio_wouldblock_error(bio); return 0; } - wait_barrier(conf, false); /* * Check reshape again to avoid reshape happens after checking @@ -2786,7 +2778,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static int narrow_write_error(struct r10bio *r10_bio, int i) +static bool narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; struct mddev *mddev = r10_bio->mddev; @@ 
-2807,10 +2799,10 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; - int ok = 1; + bool ok = true; if (rdev->badblocks.shift < 0) - return 0; + return false; block_sectors = roundup(1 << rdev->badblocks.shift, bdev_logical_block_size(rdev->bdev) >> 9); @@ -3413,7 +3405,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t from_addr, to_addr; struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t sector, first_bad; - int bad_sectors; + sector_t bad_sectors; if (!rdev || !test_bit(In_sync, &rdev->flags)) continue; @@ -3609,7 +3601,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; - int bad_sectors; + sector_t bad_sectors; struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) @@ -3716,7 +3708,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, conf->cluster_sync_low = mddev->curr_resync_completed; raid10_set_cluster_sync_high(conf); /* Send resync message */ - md_cluster_ops->resync_info_update(mddev, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -3749,7 +3741,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } if (broadcast_msg) { raid10_set_cluster_sync_high(conf); - md_cluster_ops->resync_info_update(mddev, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -4541,7 +4533,7 @@ static int raid10_start_reshape(struct mddev *mddev) if (ret) goto abort; - ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { mddev->bitmap_ops->resize(mddev, oldsize, 0, false); goto abort; @@ -4832,7 +4824,7 @@ read_more: conf->cluster_sync_low = sb_reshape_pos; } - md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -4977,7 +4969,7 @@ static void raid10_update_reshape_pos(struct mddev *mddev) struct r10conf *conf = mddev->private; sector_t lo, hi; - md_cluster_ops->resync_info_get(mddev, &lo, &hi); + mddev->cluster_ops->resync_info_get(mddev, &lo, &hi); if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) || mddev->reshape_position == MaxSector) conf->reshape_progress = mddev->reshape_position; @@ -5123,9 +5115,13 @@ static void raid10_finish_reshape(struct mddev *mddev) static struct md_personality raid10_personality = { - .name = "raid10", - .level = 10, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID10, + .name = "raid10", + .owner = THIS_MODULE, + }, + .make_request = raid10_make_request, .run = raid10_run, .free = raid10_free, @@ -5145,18 +5141,18 @@ static struct md_personality raid10_personality = .update_reshape_pos = raid10_update_reshape_pos, }; -static int __init raid_init(void) +static int __init raid10_init(void) { - return register_md_personality(&raid10_personality); + return register_md_submodule(&raid10_personality.head); } -static void raid_exit(void) +static void __exit raid10_exit(void) { - unregister_md_personality(&raid10_personality); + unregister_md_submodule(&raid10_personality.head); } -module_init(raid_init); -module_exit(raid_exit); +module_init(raid10_init); +module_exit(raid10_exit); MODULE_LICENSE("GPL"); 
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5c79429acc64..6389383166c0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5858,6 +5858,9 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev, struct r5conf *conf, sector_t logical_sector) { sector_t reshape_progress, reshape_safe; + + if (likely(conf->reshape_progress == MaxSector)) + return LOC_NO_RESHAPE; /* * Spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be @@ -5935,22 +5938,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, const int rw = bio_data_dir(bi); enum stripe_result ret; struct stripe_head *sh; + enum reshape_loc loc; sector_t new_sector; int previous = 0, flags = 0; int seq, dd_idx; seq = read_seqcount_begin(&conf->gen_lock); - - if (unlikely(conf->reshape_progress != MaxSector)) { - enum reshape_loc loc = get_reshape_loc(mddev, conf, - logical_sector); - if (loc == LOC_INSIDE_RESHAPE) { - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; - } - if (loc == LOC_AHEAD_OF_RESHAPE) - previous = 1; + loc = get_reshape_loc(mddev, conf, logical_sector); + if (loc == LOC_INSIDE_RESHAPE) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } + if (loc == LOC_AHEAD_OF_RESHAPE) + previous = 1; new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); @@ -6127,7 +6127,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && - (conf->reshape_progress != MaxSector) && get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { bio_wouldblock_error(bi); if (rw == WRITE) @@ -8954,9 +8953,13 @@ static void raid5_prepare_suspend(struct mddev *mddev) static struct md_personality raid6_personality = { - .name = "raid6", - .level = 6, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID6, + .name = "raid6", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -8980,9 +8983,13 @@ static struct md_personality raid6_personality = }; static struct md_personality raid5_personality = { - .name = "raid5", - .level = 5, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID5, + .name = "raid5", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -9007,9 +9014,13 @@ static struct md_personality raid5_personality = static struct md_personality raid4_personality = { - .name = "raid4", - .level = 4, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID4, + .name = "raid4", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -9045,21 +9056,39 @@ static int __init raid5_init(void) "md/raid5:prepare", raid456_cpu_up_prepare, raid456_cpu_dead); - if (ret) { - destroy_workqueue(raid5_wq); - return ret; - } - register_md_personality(&raid6_personality); - register_md_personality(&raid5_personality); - register_md_personality(&raid4_personality); + if (ret) + goto err_destroy_wq; + + ret = register_md_submodule(&raid6_personality.head); + if (ret) + goto err_cpuhp_remove; + + ret = register_md_submodule(&raid5_personality.head); + if (ret) + goto err_unregister_raid6; + + ret = register_md_submodule(&raid4_personality.head); + if (ret) + goto err_unregister_raid5; + return 0; + 
+err_unregister_raid5: + unregister_md_submodule(&raid5_personality.head); +err_unregister_raid6: + unregister_md_submodule(&raid6_personality.head); +err_cpuhp_remove: + cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); +err_destroy_wq: + destroy_workqueue(raid5_wq); + return ret; } -static void raid5_exit(void) +static void __exit raid5_exit(void) { - unregister_md_personality(&raid6_personality); - unregister_md_personality(&raid5_personality); - unregister_md_personality(&raid4_personality); + unregister_md_submodule(&raid6_personality.head); + unregister_md_submodule(&raid5_personality.head); + unregister_md_submodule(&raid4_personality.head); cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); destroy_workqueue(raid5_wq); } diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 5b617c1f6789..f4398383ae06 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1904,7 +1904,7 @@ static void msb_io_work(struct work_struct *work) /* process the request */ dbg_verbose("IO: processing new request"); - blk_rq_map_sg(msb->queue, req, sg); + blk_rq_map_sg(req, sg); lba = blk_rq_pos(req); diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 634d343b6bdb..c9853d887d28 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -627,9 +627,7 @@ static int mspro_block_issue_req(struct memstick_dev *card) while (true) { msb->current_page = 0; msb->current_seg = 0; - msb->seg_count = blk_rq_map_sg(msb->block_req->q, - msb->block_req, - msb->req_sg); + msb->seg_count = blk_rq_map_sg(msb->block_req, msb->req_sg); if (!msb->seg_count) { unsigned int bytes = blk_rq_cur_bytes(msb->block_req); diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index ab662f502fe7..3ba62f825b84 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -523,5 +523,5 @@ unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq) { struct request *req = mmc_queue_req_to_req(mqrq); - return blk_rq_map_sg(mq->queue, req, mqrq->sg); + return blk_rq_map_sg(req, mqrq->sg); } diff --git a/drivers/mmc/host/cqhci-crypto.c b/drivers/mmc/host/cqhci-crypto.c index cb8044093402..5a467098a0d6 100644 --- a/drivers/mmc/host/cqhci-crypto.c +++ b/drivers/mmc/host/cqhci-crypto.c @@ -84,11 +84,11 @@ static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile, if (ccap_array[cap_idx].algorithm_id == CQHCI_CRYPTO_ALG_AES_XTS) { /* In XTS mode, the blk_crypto_key's size is already doubled */ - memcpy(cfg.crypto_key, key->raw, key->size/2); + memcpy(cfg.crypto_key, key->bytes, key->size/2); memcpy(cfg.crypto_key + CQHCI_CRYPTO_KEY_MAX_SIZE/2, - key->raw + key->size/2, key->size/2); + key->bytes + key->size/2, key->size/2); } else { - memcpy(cfg.crypto_key, key->raw, key->size); + memcpy(cfg.crypto_key, key->bytes, key->size); } cqhci_crypto_program_key(cq_host, &cfg, slot); @@ -204,6 +204,8 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) /* Unfortunately, CQHCI crypto only supports 32 DUN bits. */ profile->max_dun_bytes_supported = 4; + profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; + /* * Cache all the crypto capabilities and advertise the supported crypto * modes and data unit sizes to the block layer. 
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index e3d39311fdc7..3c383bce4928 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1895,6 +1895,7 @@ static int sdhci_msm_ice_init(struct sdhci_msm_host *msm_host, profile->ll_ops = sdhci_msm_crypto_ops; profile->max_dun_bytes_supported = 4; + profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; profile->dev = dev; /* @@ -1968,7 +1969,7 @@ static int sdhci_msm_ice_keyslot_program(struct blk_crypto_profile *profile, return qcom_ice_program_key(msm_host->ice, QCOM_ICE_CRYPTO_ALG_AES_XTS, QCOM_ICE_CRYPTO_KEY_SIZE_256, - key->raw, + key->bytes, key->crypto_cfg.data_unit_size / 512, slot); } diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 2836905f0152..39cc0a6a4d37 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -199,7 +199,7 @@ static blk_status_t ubiblock_read(struct request *req) * and ubi_read_sg() will check that limit. */ ubi_sgl_init(&pdu->usgl); - blk_rq_map_sg(req->q, req, pdu->usgl.sg); + blk_rq_map_sg(req, pdu->usgl.sg); while (bytes_left) { /* diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c index a002ea6fdd84..ee478ccde7c6 100644 --- a/drivers/nvdimm/badrange.c +++ b/drivers/nvdimm/badrange.c @@ -167,7 +167,7 @@ static void set_badblock(struct badblocks *bb, sector_t s, int num) dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", (u64) s * 512, (u64) num * 512); /* this isn't an error as the hardware will still throw an exception */ - if (badblocks_set(bb, s, num, 1)) + if (!badblocks_set(bb, s, num, 1)) dev_info_once(bb->dev, "%s: failed for sector %llx\n", __func__, (u64) s); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 5ca06e9a2d29..cc5c8f3f81e8 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -673,7 +673,7 @@ static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, { if (bb->count) { sector_t first_bad; - int num_bad; + sector_t num_bad; return !!badblocks_check(bb, sector, len / 512, &first_bad, &num_bad); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index cfdfe0eaa512..8f3e816e805d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -367,9 +367,10 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) struct nd_namespace_common *ndns = nd_pfn->ndns; void *zero_page = page_address(ZERO_PAGE(0)); struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - int num_bad, meta_num, rc, bb_present; + int meta_num, rc, bb_present; sector_t first_bad, meta_start; struct nd_namespace_io *nsio; + sector_t num_bad; if (nd_pfn->mode != PFN_MODE_PMEM) return 0; @@ -394,7 +395,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) bb_present = badblocks_check(&nd_region->bb, meta_start, meta_num, &first_bad, &num_bad); if (bb_present) { - dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n", + dev_dbg(&nd_pfn->dev, "meta: %llx badblocks at %llx\n", num_bad, first_bad); nsoff = ALIGN_DOWN((nd_region->ndr_start + (first_bad << 9)) - nsio->res.start, @@ -413,7 +414,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) } if (rc) { dev_err(&nd_pfn->dev, - "error clearing %x badblocks at %llx\n", + "error clearing %llx badblocks at %llx\n", num_bad, first_bad); return rc; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d81faa9d89c9..43156e1576c9 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -249,7 +249,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, 
unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT; struct badblocks *bb = &pmem->bb; sector_t first_bad; - int num_bad; + sector_t num_bad; if (kaddr) *kaddr = pmem->virt_addr + offset; diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index 244432e0b73d..da963e4f3f1f 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -12,3 +12,4 @@ config NVME_AUTH select CRYPTO_SHA512 select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS + select CRYPTO_HKDF diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 9b7126e1a19d..2c092ec8c0a9 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -11,9 +11,12 @@ #include <linux/unaligned.h> #include <crypto/hash.h> #include <crypto/dh.h> +#include <crypto/hkdf.h> #include <linux/nvme.h> #include <linux/nvme-auth.h> +#define HKDF_MAX_HASHLEN 64 + static u32 nvme_dhchap_seqnum; static DEFINE_MUTEX(nvme_dhchap_mutex); @@ -471,5 +474,339 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) } EXPORT_SYMBOL_GPL(nvme_auth_generate_key); +/** + * nvme_auth_generate_psk - Generate a PSK for TLS + * @hmac_id: Hash function identifier + * @skey: Session key + * @skey_len: Length of @skey + * @c1: Value of challenge C1 + * @c2: Value of challenge C2 + * @hash_len: Hash length of the hash algorithm + * @ret_psk: Pointer to the resulting generated PSK + * @ret_len: Length of @ret_psk + * + * Generate a PSK for TLS as specified in NVMe base specification, section + * 8.13.5.9: Generated PSK for TLS + * + * The generated PSK for TLS shall be computed applying the HMAC function + * using the hash function H( ) selected by the HashID parameter in the + * DH-HMAC-CHAP_Challenge message with the session key KS as key to the + * concatenation of the two challenges C1 and C2 (i.e., generated + * PSK = HMAC(KS, C1 || C2)). + * + * Returns 0 on success with a valid generated PSK pointer in @ret_psk and + * the length of @ret_psk in @ret_len, or a negative error number otherwise.
+ */ +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *psk; + const char *hmac_name; + int ret, psk_len; + + if (!c1 || !c2) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + psk_len = crypto_shash_digestsize(tfm); + psk = kzalloc(psk_len, GFP_KERNEL); + if (!psk) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, skey, skey_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c1, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c2, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_final(shash, psk); + if (!ret) { + *ret_psk = psk; + *ret_len = psk_len; + } + +out_free_psk: + if (ret) + kfree_sensitive(psk); +out_free_tfm: + crypto_free_shash(tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); + +/** + * nvme_auth_generate_digest - Generate TLS PSK digest + * @hmac_id: Hash function identifier + * @psk: Generated input PSK + * @psk_len: Length of @psk + * @subsysnqn: NQN of the subsystem + * @hostnqn: NQN of the host + * @ret_digest: Pointer to the returned digest + * + * Generate a TLS PSK digest as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The PSK digest shall be computed by encoding in Base64 (refer to RFC + * 4648) the result of the application of the HMAC function using the hash + * function specified in item 4 above (ie the hash function of the cipher + * suite associated with the PSK identity) with the PSK as HMAC key to the + * concatenation of: + * - the NQN of the host (i.e., NQNh) not including the null terminator; + * - a space character; + * - the NQN of the NVM subsystem (i.e., NQNc) not including the null + * terminator; + * - a space character; and + * - the seventeen ASCII characters "NVMe-over-Fabrics" + * (i.e., <PSK digest> = Base64(HMAC(PSK, NQNh || " " || NQNc || " " || + * "NVMe-over-Fabrics"))). + * The length of the PSK digest depends on the hash function used to compute + * it as follows: + * - If the SHA-256 hash function is used, the resulting PSK digest is 44 + * characters long; or + * - If the SHA-384 hash function is used, the resulting PSK digest is 64 + * characters long. + * + * Returns 0 on success with a valid digest pointer in @ret_digest, or a + * negative error number on failure. 
+ */ +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *digest, *enc; + const char *hmac_name; + size_t digest_len, hmac_len; + int ret; + + if (WARN_ON(!subsysnqn || !hostnqn)) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + switch (nvme_auth_hmac_hash_len(hmac_id)) { + case 32: + hmac_len = 44; + break; + case 48: + hmac_len = 64; + break; + default: + pr_warn("%s: invalid hash algorithm '%s'\n", + __func__, hmac_name); + return -EINVAL; + } + + enc = kzalloc(hmac_len + 1, GFP_KERNEL); + if (!enc) + return -ENOMEM; + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out_free_enc; + } + + digest_len = crypto_shash_digestsize(tfm); + digest = kzalloc(digest_len, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, psk, psk_len); + if (ret) + goto out_free_digest; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " ", 1); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18); + if (ret) + goto out_free_digest; + + ret = crypto_shash_final(shash, digest); + if (ret) + goto out_free_digest; + + ret = base64_encode(digest, digest_len, enc); + if (ret < hmac_len) { + ret = -ENOKEY; + goto out_free_digest; + } + *ret_digest = enc; + ret = 0; + +out_free_digest: + kfree_sensitive(digest); +out_free_tfm: + crypto_free_shash(tfm); +out_free_enc: + if (ret) + kfree_sensitive(enc); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); + +/** + * nvme_auth_derive_tls_psk - Derive TLS PSK + * @hmac_id: Hash function identifier + * @psk: generated input PSK + * @psk_len: size of @psk + * @psk_digest: TLS PSK digest + * @ret_psk: Pointer to the resulting TLS PSK + * + * Derive a TLS PSK as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The TLS PSK shall be derived as follows from an input PSK + * (i.e., either a retained PSK or a generated PSK) and a PSK + * identity using the HKDF-Extract and HKDF-Expand-Label operations + * (refer to RFC 5869 and RFC 8446) where the hash function is the + * one specified by the hash specifier of the PSK identity: + * 1. PRK = HKDF-Extract(0, Input PSK); and + * 2. TLS PSK = HKDF-Expand-Label(PRK, "nvme-tls-psk", PskIdentityContext, L), + * where PskIdentityContext is the hash identifier indicated in + * the PSK identity concatenated to a space character and to the + * Base64 PSK digest (i.e., "<hash> <PSK digest>") and L is the + * output size in bytes of the hash function (i.e., 32 for SHA-256 + * and 48 for SHA-384). + * + * Returns 0 on success with a valid psk pointer in @ret_psk or a negative + * error number otherwise. 
+ */ +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk) +{ + struct crypto_shash *hmac_tfm; + const char *hmac_name; + const char *psk_prefix = "tls13 nvme-tls-psk"; + static const char default_salt[HKDF_MAX_HASHLEN]; + size_t info_len, prk_len; + char *info; + unsigned char *prk, *tls_key; + int ret; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + if (hmac_id == NVME_AUTH_HASH_SHA512) { + pr_warn("%s: unsupported hash algorithm %s\n", + __func__, hmac_name); + return -EINVAL; + } + + hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(hmac_tfm)) + return PTR_ERR(hmac_tfm); + + prk_len = crypto_shash_digestsize(hmac_tfm); + prk = kzalloc(prk_len, GFP_KERNEL); + if (!prk) { + ret = -ENOMEM; + goto out_free_shash; + } + + if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) { + ret = -EINVAL; + goto out_free_prk; + } + ret = hkdf_extract(hmac_tfm, psk, psk_len, + default_salt, prk_len, prk); + if (ret) + goto out_free_prk; + + ret = crypto_shash_setkey(hmac_tfm, prk, prk_len); + if (ret) + goto out_free_prk; + + /* + * 2 additional bytes for the length field from HKDF-Expand-Label, + * 2 additional bytes for the HMAC ID, and one byte for the space + * separator. + */ + info_len = strlen(psk_digest) + strlen(psk_prefix) + 5; + info = kzalloc(info_len + 1, GFP_KERNEL); + if (!info) { + ret = -ENOMEM; + goto out_free_prk; + } + + put_unaligned_be16(psk_len, info); + memcpy(info + 2, psk_prefix, strlen(psk_prefix)); + sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest); + + tls_key = kzalloc(psk_len, GFP_KERNEL); + if (!tls_key) { + ret = -ENOMEM; + goto out_free_info; + } + ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len); + if (ret) { + kfree(tls_key); + goto out_free_info; + } + *ret_psk = tls_key; + +out_free_info: + kfree(info); +out_free_prk: + kfree(prk); +out_free_shash: + crypto_free_shash(hmac_tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index ed5167f942d8..32d16c53133b 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -5,7 +5,6 @@ #include <linux/module.h> #include <linux/seq_file.h> -#include <linux/key.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/nvme.h> @@ -124,6 +123,70 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, return key_ref_to_ptr(keyref); } +/** + * nvme_tls_psk_refresh - Refresh TLS PSK + * @keyring: Keyring holding the TLS PSK + * @hostnqn: Host NQN to use + * @subnqn: Subsystem NQN to use + * @hmac_id: Hash function identifier + * @data: TLS PSK key material + * @data_len: Length of @data + * @digest: TLS PSK digest + * + * Refresh a generated version 1 TLS PSK with the identity generated + * from @hmac_id, @hostnqn, @subnqn, and @digest in the keyring given + * by @keyring. + * + * Returns the updated key on success or an error pointer otherwise.
+ */ +struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, const char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest) +{ + key_perm_t keyperm = + KEY_POS_SEARCH | KEY_POS_VIEW | KEY_POS_READ | + KEY_POS_WRITE | KEY_POS_LINK | KEY_POS_SETATTR | + KEY_USR_SEARCH | KEY_USR_VIEW | KEY_USR_READ; + char *identity; + key_ref_t keyref; + key_serial_t keyring_id; + struct key *key; + + if (!hostnqn || !subnqn || !data || !data_len) + return ERR_PTR(-EINVAL); + + identity = kasprintf(GFP_KERNEL, "NVMe1G%02d %s %s %s", + hmac_id, hostnqn, subnqn, digest); + if (!identity) + return ERR_PTR(-ENOMEM); + + if (!keyring) + keyring = nvme_keyring; + keyring_id = key_serial(keyring); + pr_debug("keyring %x refresh tls psk '%s'\n", + keyring_id, identity); + keyref = key_create_or_update(make_key_ref(keyring, true), + "psk", identity, data, data_len, + keyperm, KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(keyref)) { + pr_debug("refresh tls psk '%s' failed, error %ld\n", + identity, PTR_ERR(keyref)); + kfree(identity); + return ERR_PTR(-ENOKEY); + } + kfree(identity); + /* + * Set the default timeout to 1 hour + * as suggested in TP8018. + */ + key = key_ref_to_ptr(keyref); + key_set_timeout(key, 3600); + return key; +} +EXPORT_SYMBOL_GPL(nvme_tls_psk_refresh); + /* * NVMe PSK priority list * diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 486afe598184..10e453b2436e 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -109,7 +109,7 @@ config NVME_HOST_AUTH bool "NVMe over Fabrics In-Band Authentication in host side" depends on NVME_CORE select NVME_AUTH - select NVME_KEYRING if NVME_TCP_TLS + select NVME_KEYRING help This provides support for NVMe over Fabrics In-Band Authentication in host side. 
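Taken together, the helpers added to drivers/nvme/common above form a small pipeline: the DH-HMAC-CHAP session key is turned into a generated PSK, the PSK into a Base64 digest bound to both NQNs, the digest and PSK into a TLS PSK via HKDF, and the result is inserted into the keyring with a one-hour timeout. The condensed sketch below chains the four exported calls in the same way the host-side nvme_auth_secure_concat() and target-side nvmet_auth_insert_psk() further down do; the wrapper name and its flattened error handling are illustrative only.

#include <linux/err.h>
#include <linux/key.h>
#include <linux/slab.h>
#include <linux/nvme-auth.h>
#include <linux/nvme-keyring.h>

/*
 * Hypothetical wrapper around the exported helpers:
 *   1. generated PSK = HMAC(KS, C1 || C2)
 *   2. digest = Base64(HMAC(PSK, NQNh || " " || NQNc || " NVMe-over-Fabrics"))
 *   3. TLS PSK = HKDF-Expand-Label(HKDF-Extract(0, PSK), "nvme-tls-psk", ...)
 *   4. insert the TLS PSK into the keyring (NULL selects the default keyring)
 */
static struct key *example_derive_tls_key(u8 hash_id, u8 *skey, size_t skey_len,
					  u8 *c1, u8 *c2, size_t hash_len,
					  char *hostnqn, char *subnqn)
{
	u8 *psk, *digest, *tls_psk;
	struct key *tls_key = ERR_PTR(-ENOKEY);
	size_t psk_len;

	if (nvme_auth_generate_psk(hash_id, skey, skey_len, c1, c2,
				   hash_len, &psk, &psk_len))
		return tls_key;
	if (nvme_auth_generate_digest(hash_id, psk, psk_len,
				      subnqn, hostnqn, &digest))
		goto out_psk;
	if (nvme_auth_derive_tls_psk(hash_id, psk, psk_len, digest, &tls_psk))
		goto out_digest;

	tls_key = nvme_tls_psk_refresh(NULL, hostnqn, subnqn, hash_id,
				       tls_psk, psk_len, digest);
	kfree_sensitive(tls_psk);
out_digest:
	kfree_sensitive(digest);
out_psk:
	kfree_sensitive(psk);
	return tls_key;
}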
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 8971aca41e63..57e863ecde58 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -525,7 +525,7 @@ static blk_status_t apple_nvme_map_data(struct apple_nvme *anv, if (!iod->sg) return BLK_STS_RESOURCE; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + iod->nents = blk_rq_map_sg(req, iod->sg); if (!iod->nents) goto out_free_sg; diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 5ea0e21709da..6115fef74c1e 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -12,6 +12,7 @@ #include "nvme.h" #include "fabrics.h" #include <linux/nvme-auth.h> +#include <linux/nvme-keyring.h> #define CHAP_BUF_SIZE 4096 static struct kmem_cache *nvme_chap_buf_cache; @@ -131,7 +132,13 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, data->auth_type = NVME_AUTH_COMMON_MESSAGES; data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; data->t_id = cpu_to_le16(chap->transaction); - data->sc_c = 0; /* No secure channel concatenation */ + if (ctrl->opts->concat && chap->qid == 0) { + if (ctrl->opts->tls_key) + data->sc_c = NVME_AUTH_SECP_REPLACETLSPSK; + else + data->sc_c = NVME_AUTH_SECP_NEWTLSPSK; + } else + data->sc_c = NVME_AUTH_SECP_NOSC; data->napd = 1; data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; data->auth_protocol[0].dhchap.halen = 3; @@ -311,8 +318,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, data->hl = chap->hash_len; data->dhvlen = cpu_to_le16(chap->host_key_len); memcpy(data->rval, chap->response, chap->hash_len); - if (ctrl->ctrl_key) { + if (ctrl->ctrl_key) chap->bi_directional = true; + if (ctrl->ctrl_key || ctrl->opts->concat) { get_random_bytes(chap->c2, chap->hash_len); data->cvalid = 1; memcpy(data->rval + chap->hash_len, chap->c2, @@ -322,7 +330,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, } else { memset(chap->c2, 0, chap->hash_len); } - chap->s2 = nvme_auth_get_seqnum(); + if (ctrl->opts->concat) + chap->s2 = 0; + else + chap->s2 = nvme_auth_get_seqnum(); data->seqnum = cpu_to_le32(chap->s2); if (chap->host_key_len) { dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n", @@ -677,6 +688,92 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_kpp(chap->dh_tfm); } +void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) +{ + dev_dbg(ctrl->device, "Wipe generated TLS PSK %08x\n", + key_serial(ctrl->opts->tls_key)); + key_revoke(ctrl->opts->tls_key); + key_put(ctrl->opts->tls_key); + ctrl->opts->tls_key = NULL; +} +EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key); + +static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + u8 *psk, *digest, *tls_psk; + struct key *tls_key; + size_t psk_len; + int ret = 0; + + if (!chap->sess_key) { + dev_warn(ctrl->device, + "%s: qid %d no session key negotiated\n", + __func__, chap->qid); + return -ENOKEY; + } + + if (chap->qid) { + dev_warn(ctrl->device, + "qid %d: secure concatenation not supported on I/O queues\n", + chap->qid); + return -EINVAL; + } + ret = nvme_auth_generate_psk(chap->hash_id, chap->sess_key, + chap->sess_key_len, + chap->c1, chap->c2, + chap->hash_len, &psk, &psk_len); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to generate PSK, error %d\n", + __func__, chap->qid, ret); + return ret; + } + dev_dbg(ctrl->device, + "%s: generated psk %*ph\n", __func__, (int)psk_len, psk); 
+ + ret = nvme_auth_generate_digest(chap->hash_id, psk, psk_len, + ctrl->opts->subsysnqn, + ctrl->opts->host->nqn, &digest); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to generate digest, error %d\n", + __func__, chap->qid, ret); + goto out_free_psk; + }; + dev_dbg(ctrl->device, "%s: generated digest %s\n", + __func__, digest); + ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len, + digest, &tls_psk); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to derive TLS psk, error %d\n", + __func__, chap->qid, ret); + goto out_free_digest; + }; + + tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring, + ctrl->opts->host->nqn, + ctrl->opts->subsysnqn, chap->hash_id, + tls_psk, psk_len, digest); + if (IS_ERR(tls_key)) { + ret = PTR_ERR(tls_key); + dev_warn(ctrl->device, + "%s: qid %d failed to insert generated key, error %d\n", + __func__, chap->qid, ret); + tls_key = NULL; + } + kfree_sensitive(tls_psk); + if (ctrl->opts->tls_key) + nvme_auth_revoke_tls_key(ctrl); + ctrl->opts->tls_key = tls_key; +out_free_digest: + kfree_sensitive(digest); +out_free_psk: + kfree_sensitive(psk); + return ret; +} + static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = @@ -833,6 +930,13 @@ static void nvme_queue_auth_work(struct work_struct *work) } if (!ret) { chap->error = 0; + if (ctrl->opts->concat && + (ret = nvme_auth_secure_concat(ctrl, chap))) { + dev_warn(ctrl->device, + "%s: qid %d failed to enable secure concatenation\n", + __func__, chap->qid); + chap->error = ret; + } return; } @@ -912,6 +1016,11 @@ static void nvme_ctrl_auth_work(struct work_struct *work) "qid 0: authentication failed\n"); return; } + /* + * Only run authentication on the admin queue for secure concatenation. + */ + if (ctrl->opts->concat) + return; for (q = 1; q < ctrl->queue_count; q++) { ret = nvme_auth_negotiate(ctrl, q); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8359d0aa0e44..777db89fdaa7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4018,6 +4018,9 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (!nvme_ns_head_multipath(ns->head)) nvme_cdev_del(&ns->cdev, &ns->cdev_device); + + nvme_mpath_remove_sysfs_link(ns); + del_gendisk(ns->disk); mutex_lock(&ns->ctrl->namespaces_lock); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 432efcbf9e2f..93e9041b9657 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -472,8 +472,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) result = le32_to_cpu(res.u32); ctrl->cntlid = result & 0xFFFF; if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { - /* Secure concatenation is not implemented */ - if (result & NVME_CONNECT_AUTHREQ_ASCR) { + /* Check for secure concatenation */ + if ((result & NVME_CONNECT_AUTHREQ_ASCR) && + !ctrl->opts->concat) { dev_warn(ctrl->device, "qid 0: secure concatenation is not supported\n"); ret = -EOPNOTSUPP; @@ -550,7 +551,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) /* Secure concatenation is not implemented */ if (result & NVME_CONNECT_AUTHREQ_ASCR) { dev_warn(ctrl->device, - "qid 0: secure concatenation is not supported\n"); + "qid %d: secure concatenation is not supported\n", qid); ret = -EOPNOTSUPP; goto out_free_data; } @@ -706,6 +707,7 @@ static const match_table_t opt_tokens = { #endif #ifdef CONFIG_NVME_TCP_TLS { NVMF_OPT_TLS, "tls" }, + { NVMF_OPT_CONCAT, "concat" }, #endif { NVMF_OPT_ERR, NULL } }; @@ -735,6 +737,7 @@ static int 
nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->tls = false; opts->tls_key = NULL; opts->keyring = NULL; + opts->concat = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -1053,6 +1056,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->tls = true; break; + case NVMF_OPT_CONCAT: + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) { + pr_err("TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + opts->concat = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -1079,6 +1090,23 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", opts->fast_io_fail_tmo, ctrl_loss_tmo); } + if (opts->concat) { + if (opts->tls) { + pr_err("Secure concatenation over TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + if (opts->tls_key) { + pr_err("Cannot specify a TLS key for secure concatenation\n"); + ret = -EINVAL; + goto out; + } + if (!opts->dhchap_secret) { + pr_err("Need to enable DH-CHAP for secure concatenation\n"); + ret = -EINVAL; + goto out; + } + } opts->host = nvmf_host_add(hostnqn, &hostid); if (IS_ERR(opts->host)) { diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 21d75dc4a3a0..9cf5b020adba 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -66,6 +66,7 @@ enum { NVMF_OPT_TLS = 1 << 25, NVMF_OPT_KEYRING = 1 << 26, NVMF_OPT_TLS_KEY = 1 << 27, + NVMF_OPT_CONCAT = 1 << 28, }; /** @@ -101,6 +102,7 @@ enum { * @keyring: Keyring to use for key lookups * @tls_key: TLS key for encrypted connections (TCP) * @tls: Start TLS encrypted connections (TCP) + * @concat: Enabled Secure channel concatenation (TCP) * @disable_sqflow: disable controller sq flow control * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) @@ -130,6 +132,7 @@ struct nvmf_ctrl_options { struct key *keyring; struct key *tls_key; bool tls; + bool concat; bool disable_sqflow; bool hdr_digest; bool data_digest; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b9929a5a7f4e..2257c3c96dd2 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2571,7 +2571,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, if (ret) return -ENOMEM; - op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); + op->nents = blk_rq_map_sg(rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, rq_dma_dir(rq)); @@ -2858,7 +2858,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) unsigned int nr_io_queues; int ret; - nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(), ctrl->lport->ops->max_hw_queues); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { @@ -2912,7 +2912,7 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) unsigned int nr_io_queues; int ret; - nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(), ctrl->lport->ops->max_hw_queues); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 2a7635565083..6b12ca80aa27 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -686,6 +686,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) 
kblockd_schedule_work(&head->partition_scan_work); } + nvme_mpath_add_sysfs_link(ns->head); + mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { int node, srcu_idx; @@ -768,6 +770,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, if (nvme_state_is_live(ns->ana_state) && nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) nvme_mpath_set_live(ns); + else { + /* + * Add sysfs link from multipath head gendisk node to path + * device gendisk node. + * If path's ana state is live (i.e. state is either optimized + * or non-optimized) while we alloc the ns then sysfs link would + * be created from nvme_mpath_set_live(). In that case we would + * not fallthrough this code path. However for the path's ana + * state other than live, we call nvme_mpath_set_live() only + * after ana state transitioned to the live state. But we still + * want to create the sysfs link from head node to a path device + * irrespective of the path's ana state. + * If we reach through here then it means that path's ana state + * is not live but still create the sysfs link to this path from + * head node if head node of the path has already come alive. + */ + if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags)) + nvme_mpath_add_sysfs_link(ns->head); + } } static int nvme_update_ana_state(struct nvme_ctrl *ctrl, @@ -955,6 +976,45 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, } DEVICE_ATTR_RO(ana_state); +static ssize_t queue_depth_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD) + return 0; + + return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active)); +} +DEVICE_ATTR_RO(queue_depth); + +static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int node, srcu_idx; + nodemask_t numa_nodes; + struct nvme_ns *current_ns; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_head *head = ns->head; + + if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA) + return 0; + + nodes_clear(numa_nodes); + + srcu_idx = srcu_read_lock(&head->srcu); + for_each_node(node) { + current_ns = srcu_dereference(head->current_path[node], + &head->srcu); + if (ns == current_ns) + node_set(node, numa_nodes); + } + srcu_read_unlock(&head->srcu, srcu_idx); + + return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes)); +} +DEVICE_ATTR_RO(numa_nodes); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { @@ -967,6 +1027,84 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, return -ENXIO; /* just break out of the loop */ } +void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head) +{ + struct device *target; + int rc, srcu_idx; + struct nvme_ns *ns; + struct kobject *kobj; + + /* + * Ensure head disk node is already added otherwise we may get invalid + * kobj for head disk node + */ + if (!test_bit(GD_ADDED, &head->disk->state)) + return; + + kobj = &disk_to_dev(head->disk)->kobj; + + /* + * loop through each ns chained through the head->list and create the + * sysfs link from head node to the ns path node + */ + srcu_idx = srcu_read_lock(&head->srcu); + + list_for_each_entry_rcu(ns, &head->list, siblings) { + /* + * Avoid creating link if it already exists for the given path.
+ * When path ana state transitions from optimized to non- + * optimized or vice-versa, the nvme_mpath_set_live() is + * invoked which in turn calls this function. Now if the sysfs + * link already exists for the given path and we attempt to re- + * create the link then sysfs code would warn about it loudly. + * So we evaluate the NVME_NS_SYSFS_ATTR_LINK flag here to ensure + * that we're not creating a duplicate link. + * The test_and_set_bit() is used because it is protecting + * against multiple nvme paths being simultaneously added. + */ + if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) + continue; + + /* + * Ensure that ns path disk node is already added otherwise we + * may get invalid kobj name for target + */ + if (!test_bit(GD_ADDED, &ns->disk->state)) + continue; + + target = disk_to_dev(ns->disk); + /* + * Create sysfs link from head gendisk kobject @kobj to the + * ns path gendisk kobject @target->kobj. + */ + rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name, + &target->kobj, dev_name(target)); + if (unlikely(rc)) { + dev_err(disk_to_dev(ns->head->disk), + "failed to create link to %s\n", + dev_name(target)); + clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags); + } + } + + srcu_read_unlock(&head->srcu, srcu_idx); +} + +void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns) +{ + struct device *target; + struct kobject *kobj; + + if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) + return; + + target = disk_to_dev(ns->disk); + kobj = &disk_to_dev(ns->head->disk)->kobj; + sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name, + dev_name(target)); + clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags); +} + void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { if (nvme_ctrl_use_ana(ns->ctrl)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7be92d07430e..51e078642127 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -534,10 +534,11 @@ struct nvme_ns { struct nvme_ns_head *head; unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_ANA_PENDING 2 -#define NVME_NS_FORCE_RO 3 -#define NVME_NS_READY 4 +#define NVME_NS_REMOVING 0 +#define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 +#define NVME_NS_READY 4 +#define NVME_NS_SYSFS_ATTR_LINK 5 struct cdev cdev; struct device cdev_device; @@ -933,6 +934,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; +extern const struct attribute_group nvme_ns_mpath_attr_group; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; extern const struct attribute_group nvme_dev_attrs_group; @@ -955,6 +957,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); +void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns); void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid); void nvme_mpath_remove_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); @@ -980,6 +984,8 @@ static inline void nvme_trace_bio_complete(struct request *req) extern bool multipath; extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute
dev_attr_ana_state; +extern struct device_attribute dev_attr_queue_depth; +extern struct device_attribute dev_attr_numa_nodes; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) @@ -1009,6 +1015,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } +static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns) +{ +} static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) { return false; @@ -1147,6 +1159,7 @@ void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); void nvme_auth_free(struct nvme_ctrl *ctrl); +void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl); #else static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { @@ -1169,6 +1182,7 @@ static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) return -EPROTONOSUPPORT; } static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; +static inline void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) {}; #endif u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3ad7f197c808..2883d17ee1eb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -812,7 +812,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (!iod->sgt.sgl) return BLK_STS_RESOURCE; sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); + iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl); if (!iod->sgt.orig_nents) goto out_free_sg; @@ -953,9 +953,6 @@ out_free_cmd: return ret; } -/* - * NOTE: ns is NULL when called on the admin queue. 
- */ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86a2891d9bcc..b5a0295b5bf4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1476,8 +1476,7 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, if (ret) return -ENOMEM; - req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, - req->data_sgl.sg_table.sgl); + req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl); *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3a41b9ab0f13..6d31226f7a4f 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -258,6 +258,8 @@ static struct attribute *nvme_ns_attrs[] = { #ifdef CONFIG_NVME_MULTIPATH &dev_attr_ana_grpid.attr, &dev_attr_ana_state.attr, + &dev_attr_queue_depth.attr, + &dev_attr_numa_nodes.attr, #endif &dev_attr_io_passthru_err_log_enabled.attr, NULL, @@ -290,6 +292,10 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; } + if (a == &dev_attr_queue_depth.attr || a == &dev_attr_numa_nodes.attr) { + if (nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } #endif return a->mode; } @@ -299,8 +305,22 @@ static const struct attribute_group nvme_ns_attr_group = { .is_visible = nvme_ns_attrs_are_visible, }; +#ifdef CONFIG_NVME_MULTIPATH +static struct attribute *nvme_ns_mpath_attrs[] = { + NULL, +}; + +const struct attribute_group nvme_ns_mpath_attr_group = { + .name = "multipath", + .attrs = nvme_ns_mpath_attrs, +}; +#endif + const struct attribute_group *nvme_ns_attr_groups[] = { &nvme_ns_attr_group, +#ifdef CONFIG_NVME_MULTIPATH + &nvme_ns_mpath_attr_group, +#endif NULL, }; @@ -780,10 +800,10 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_tls_key.attr && - !ctrl->opts->tls) + !ctrl->opts->tls && !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_configured_key.attr && - !ctrl->opts->tls_key) + (!ctrl->opts->tls_key || ctrl->opts->concat)) return 0; if (a == &dev_attr_tls_keyring.attr && !ctrl->opts->keyring) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 327f3f2f5399..26c459f0198d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,7 +8,6 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/key.h> #include <linux/nvme-tcp.h> #include <linux/nvme-keyring.h> #include <net/sock.h> @@ -249,7 +248,7 @@ static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0; - return ctrl->opts->tls; + return ctrl->opts->tls || ctrl->opts->concat; } static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) @@ -1790,7 +1789,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; - ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + ret = sock_create_kern(current->nsproxy->net_ns, + ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { dev_err(nctrl->device, @@ -2060,7 +2060,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) if (nvme_tcp_tls_configured(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); - else { + else if (ctrl->opts->tls) { pskid = 
nvme_tls_psk_default(ctrl->opts->keyring, ctrl->opts->host->nqn, ctrl->opts->subsysnqn); @@ -2090,9 +2090,25 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) { - dev_err(ctrl->device, "no PSK negotiated\n"); - return -ENOKEY; + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->concat) { + /* + * The generated PSK is stored in the + * fabric options + */ + if (!ctrl->opts->tls_key) { + dev_err(ctrl->device, "no PSK generated\n"); + return -ENOKEY; + } + if (ctrl->tls_pskid && + ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) { + dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } + } else if (!ctrl->tls_pskid) { + dev_err(ctrl->device, "no PSK negotiated\n"); + return -ENOKEY; + } } for (i = 1; i < ctrl->queue_count; i++) { @@ -2310,6 +2326,27 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, } } +/* + * The TLS key is set by secure concatenation after negotiation has been + * completed on the admin queue. We need to revoke the key when: + * - concatenation is enabled (otherwise it's a static key set by the user) + * and + * - the generated key is present in ctrl->tls_key (otherwise there's nothing + * to revoke) + * and + * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS + * negotiation has not run). + * + * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called + * twice during secure concatenation, once on a 'normal' connection to run the + * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set), + * and once after the negotiation (which uses the key, so it _must_ be set). + */ +static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid; +} + static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; @@ -2319,6 +2356,16 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (ret) return ret; + if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) { + /* See comments for nvme_tcp_key_revoke_needed() */ + dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + ret = nvme_tcp_configure_admin_queue(ctrl, false); + if (ret) + return ret; + } + if (ctrl->icdoff) { ret = -EOPNOTSUPP; dev_err(ctrl->device, "icdoff is not supported!\n"); @@ -2415,6 +2462,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); @@ -2455,6 +2504,8 @@ static void nvme_reset_ctrl_work(struct work_struct *work) container_of(work, struct nvme_ctrl, reset_work); int ret; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_ctrl(ctrl); nvme_tcp_teardown_ctrl(ctrl, false); @@ -2951,7 +3002,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS | - NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY, + NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT, .create_ctrl = nvme_tcp_create_ctrl, }; diff --git a/drivers/nvme/host/zns.c 
b/drivers/nvme/host/zns.c index 382949e18c6a..cce4c5b55aa9 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -146,17 +146,16 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, return NULL; } -static int nvme_zone_parse_entry(struct nvme_ctrl *ctrl, - struct nvme_ns_head *head, +static int nvme_zone_parse_entry(struct nvme_ns *ns, struct nvme_zone_descriptor *entry, unsigned int idx, report_zones_cb cb, void *data) { + struct nvme_ns_head *head = ns->head; struct blk_zone zone = { }; if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { - dev_err(ctrl->device, "invalid zone type %#x\n", - entry->zt); + dev_err(ns->ctrl->device, "invalid zone type %#x\n", entry->zt); return -EINVAL; } @@ -213,8 +212,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, break; for (i = 0; i < nz && zone_idx < nr_zones; i++) { - ret = nvme_zone_parse_entry(ns->ctrl, ns->head, - &report->entries[i], + ret = nvme_zone_parse_entry(ns, &report->entries[i], zone_idx, cb, data); if (ret) goto out_free; diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index b47d675232d2..0b0645ac5df4 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/random.h> #include <linux/nvme-auth.h> +#include <linux/nvme-keyring.h> #include <linux/unaligned.h> #include "nvmet.h" @@ -139,7 +140,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) return ret; } -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { int ret = 0; struct nvmet_host_link *p; @@ -165,6 +166,11 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) goto out_unlock; } + if (nvmet_queue_tls_keyid(sq)) { + pr_debug("host %s tls enabled\n", ctrl->hostnqn); + goto out_unlock; + } + ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id); if (ret < 0) { pr_warn("Failed to setup DH group"); @@ -233,6 +239,9 @@ out_unlock: void nvmet_auth_sq_free(struct nvmet_sq *sq) { cancel_delayed_work(&sq->auth_expired_work); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + sq->tls_key = 0; +#endif kfree(sq->dhchap_c1); sq->dhchap_c1 = NULL; kfree(sq->dhchap_c2); @@ -261,6 +270,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) nvme_auth_free_key(ctrl->ctrl_key); ctrl->ctrl_key = NULL; } +#ifdef CONFIG_NVME_TARGET_TCP_TLS + if (ctrl->tls_key) { + key_put(ctrl->tls_key); + ctrl->tls_key = NULL; + } +#endif } bool nvmet_check_auth_status(struct nvmet_req *req) @@ -542,3 +557,58 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, return ret; } + +void nvmet_auth_insert_psk(struct nvmet_sq *sq) +{ + int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id); + u8 *psk, *digest, *tls_psk; + size_t psk_len; + int ret; +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key = NULL; +#endif + + ret = nvme_auth_generate_psk(sq->ctrl->shash_id, + sq->dhchap_skey, + sq->dhchap_skey_len, + sq->dhchap_c1, sq->dhchap_c2, + hash_len, &psk, &psk_len); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to generate PSK, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + return; + } + ret = nvme_auth_generate_digest(sq->ctrl->shash_id, psk, psk_len, + sq->ctrl->subsysnqn, + sq->ctrl->hostnqn, &digest); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to generate digest, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + goto out_free_psk; + } + ret = nvme_auth_derive_tls_psk(sq->ctrl->shash_id, psk, psk_len, + digest, &tls_psk); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed 
to derive TLS PSK, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + goto out_free_digest; + } +#ifdef CONFIG_NVME_TARGET_TCP_TLS + tls_key = nvme_tls_psk_refresh(NULL, sq->ctrl->hostnqn, sq->ctrl->subsysnqn, + sq->ctrl->shash_id, tls_psk, psk_len, digest); + if (IS_ERR(tls_key)) { + pr_warn("%s: ctrl %d qid %d failed to refresh key, error %ld\n", + __func__, sq->ctrl->cntlid, sq->qid, PTR_ERR(tls_key)); + tls_key = NULL; + kfree_sensitive(tls_psk); + } + if (sq->ctrl->tls_key) + key_put(sq->ctrl->tls_key); + sq->ctrl->tls_key = tls_key; +#endif + +out_free_digest: + kfree_sensitive(digest); +out_free_psk: + kfree_sensitive(psk); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2e741696f371..71f8d06998d6 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1618,8 +1618,6 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) } ctrl->cntlid = ret; - uuid_copy(&ctrl->hostid, args->hostid); - /* * Discovery controllers may use some arbitrary high value * in order to cleanup stale discovery sessions @@ -1647,7 +1645,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (args->hostid) uuid_copy(&ctrl->hostid, args->hostid); - dhchap_status = nvmet_setup_auth(ctrl); + dhchap_status = nvmet_setup_auth(ctrl, args->sq); if (dhchap_status) { pr_err("Failed to setup authentication, dhchap status %u\n", dhchap_status); @@ -1662,11 +1660,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) args->status = NVME_SC_SUCCESS; - pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s.\n", + pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s%s.\n", nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, ctrl->pi_support ? " T10-PI is enabled" : "", - nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); + nvmet_has_auth(ctrl, args->sq) ? " with DH-HMAC-CHAP" : "", + nvmet_queue_tls_keyid(args->sq) ? 
", TLS" : ""); return ctrl; diff --git a/drivers/nvme/target/debugfs.c b/drivers/nvme/target/debugfs.c index 220c7391fc19..e4300eb95101 100644 --- a/drivers/nvme/target/debugfs.c +++ b/drivers/nvme/target/debugfs.c @@ -132,6 +132,27 @@ static int nvmet_ctrl_host_traddr_show(struct seq_file *m, void *p) } NVMET_DEBUGFS_ATTR(nvmet_ctrl_host_traddr); +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static int nvmet_ctrl_tls_key_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + key_serial_t keyid = nvmet_queue_tls_keyid(ctrl->sqs[0]); + + seq_printf(m, "%08x\n", keyid); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_key); + +static int nvmet_ctrl_tls_concat_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + + seq_printf(m, "%d\n", ctrl->concat); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_concat); +#endif + int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) { char name[32]; @@ -157,6 +178,12 @@ int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) &nvmet_ctrl_state_fops); debugfs_create_file("host_traddr", S_IRUSR, ctrl->debugfs_dir, ctrl, &nvmet_ctrl_host_traddr_fops); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + debugfs_create_file("tls_concat", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_tls_concat_fops); + debugfs_create_file("tls_key", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_tls_key_fops); +#endif return 0; } diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 2022757f08dc..bf01ec414c55 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -43,8 +43,26 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) data->auth_protocol[0].dhchap.halen, data->auth_protocol[0].dhchap.dhlen); req->sq->dhchap_tid = le16_to_cpu(data->t_id); - if (data->sc_c) - return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + if (data->sc_c != NVME_AUTH_SECP_NOSC) { + if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + /* Secure concatenation can only be enabled on the admin queue */ + if (req->sq->qid) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + switch (data->sc_c) { + case NVME_AUTH_SECP_NEWTLSPSK: + if (nvmet_queue_tls_keyid(req->sq)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + break; + case NVME_AUTH_SECP_REPLACETLSPSK: + if (!nvmet_queue_tls_keyid(req->sq)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + break; + default: + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + } + ctrl->concat = true; + } if (data->napd != 1) return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; @@ -103,6 +121,12 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) nvme_auth_dhgroup_name(fallback_dhgid)); ctrl->dh_gid = fallback_dhgid; } + if (ctrl->dh_gid == NVME_AUTH_DHGROUP_NULL && ctrl->concat) { + pr_debug("%s: ctrl %d qid %d: NULL DH group invalid " + "for secure channel concatenation\n", __func__, + ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + } pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n", __func__, ctrl->cntlid, req->sq->qid, nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid); @@ -148,12 +172,22 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) if (memcmp(data->rval, response, data->hl)) { pr_info("ctrl %d qid %d host response mismatch\n", ctrl->cntlid, req->sq->qid); + pr_debug("ctrl %d qid %d rval %*ph\n", + ctrl->cntlid, req->sq->qid, data->hl, data->rval); + pr_debug("ctrl %d qid %d response %*ph\n", + ctrl->cntlid, 
req->sq->qid, data->hl, response); kfree(response); return NVME_AUTH_DHCHAP_FAILURE_FAILED; } kfree(response); pr_debug("%s: ctrl %d qid %d host authenticated\n", __func__, ctrl->cntlid, req->sq->qid); + if (!data->cvalid && ctrl->concat) { + pr_debug("%s: ctrl %d qid %d invalid challenge\n", + __func__, ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); if (data->cvalid) { req->sq->dhchap_c2 = kmemdup(data->rval + data->hl, data->hl, GFP_KERNEL); @@ -163,11 +197,23 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) pr_debug("%s: ctrl %d qid %d challenge %*ph\n", __func__, ctrl->cntlid, req->sq->qid, data->hl, req->sq->dhchap_c2); - } else { + } + /* + * NVMe Base Spec 2.2 section 8.3.4.5.4: DH-HMAC-CHAP_Reply message + * Sequence Number (SEQNUM): [ .. ] + * The value 0h is used to indicate that bidirectional authentication + * is not performed, but a challenge value C2 is carried in order to + * generate a pre-shared key (PSK) for subsequent establishment of a + * secure channel. + */ + if (req->sq->dhchap_s2 == 0) { + if (ctrl->concat) + nvmet_auth_insert_psk(req->sq); req->sq->authenticated = true; + kfree(req->sq->dhchap_c2); req->sq->dhchap_c2 = NULL; - } - req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); + } else if (!data->cvalid) + req->sq->authenticated = true; return 0; } @@ -246,7 +292,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, ctrl->cntlid, req->sq->qid); if (!req->sq->qid) { - dhchap_status = nvmet_setup_auth(ctrl); + dhchap_status = nvmet_setup_auth(ctrl, req->sq); if (dhchap_status) { pr_err("ctrl %d qid 0 failed to setup re-authentication\n", ctrl->cntlid); @@ -303,6 +349,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) } goto done_kfree; case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2: + if (ctrl->concat) + nvmet_auth_insert_psk(req->sq); req->sq->authenticated = true; pr_debug("%s: ctrl %d qid %d ctrl authenticated\n", __func__, ctrl->cntlid, req->sq->qid); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index eb406c90c167..f012bdf89850 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -234,10 +234,26 @@ err: return ret; } -static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl) +static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { + bool needs_auth = nvmet_has_auth(ctrl, sq); + key_serial_t keyid = nvmet_queue_tls_keyid(sq); + + /* Do not authenticate I/O queues for secure concatenation */ + if (ctrl->concat && sq->qid) + needs_auth = false; + + if (keyid) + pr_debug("%s: ctrl %d qid %d should %sauthenticate, tls psk %08x\n", + __func__, ctrl->cntlid, sq->qid, + needs_auth ? "" : "not ", keyid); + else + pr_debug("%s: ctrl %d qid %d should %sauthenticate%s\n", + __func__, ctrl->cntlid, sq->qid, + needs_auth ? "" : "not ", + ctrl->concat ? ", secure concatenation" : ""); return (u32)ctrl->cntlid | - (nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0); + (needs_auth ? 
NVME_CONNECT_AUTHREQ_ATR : 0); } static void nvmet_execute_admin_connect(struct nvmet_req *req) @@ -247,6 +263,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) struct nvmet_ctrl *ctrl = NULL; struct nvmet_alloc_ctrl_args args = { .port = req->port, + .sq = req->sq, .ops = req->ops, .p2p_client = req->p2p_client, .kato = le32_to_cpu(c->kato), @@ -299,7 +316,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) goto out; } - args.result = cpu_to_le32(nvmet_connect_result(ctrl)); + args.result = cpu_to_le32(nvmet_connect_result(ctrl, req->sq)); out: kfree(d); complete: @@ -357,7 +374,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out_ctrl_put; pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); - req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl, req->sq)); out: kfree(d); complete: diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 3ef4beacde32..7318b736d414 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -172,20 +172,6 @@ struct nvmet_fc_tgt_assoc { struct work_struct del_work; }; - -static inline int -nvmet_fc_iodnum(struct nvmet_fc_ls_iod *iodptr) -{ - return (iodptr - iodptr->tgtport->iod); -} - -static inline int -nvmet_fc_fodnum(struct nvmet_fc_fcp_iod *fodptr) -{ - return (fodptr - fodptr->queue->fod); -} - - /* * Association and Connection IDs: * diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a9d112d34d4f..a5c41144667c 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -162,7 +162,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, } iod->req.sg = iod->sg_table.sgl; - iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.sg_cnt = blk_rq_map_sg(req, iod->sg_table.sgl); iod->req.transfer_len = blk_rq_payload_bytes(req); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index fcf4f460dc9a..b6db8b74dc4a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -165,6 +165,9 @@ struct nvmet_sq { u8 *dhchap_skey; int dhchap_skey_len; #endif +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key; +#endif struct completion free_done; struct completion confirm_done; }; @@ -289,6 +292,7 @@ struct nvmet_ctrl { u64 err_counter; struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; bool pi_support; + bool concat; #ifdef CONFIG_NVME_TARGET_AUTH struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; @@ -298,6 +302,9 @@ struct nvmet_ctrl { u8 *dh_key; size_t dh_keysize; #endif +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key; +#endif struct nvmet_pr_log_mgr pr_log_mgr; }; @@ -583,6 +590,7 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); struct nvmet_alloc_ctrl_args { struct nvmet_port *port; + struct nvmet_sq *sq; char *subsysnqn; char *hostnqn; uuid_t *hostid; @@ -819,7 +827,7 @@ static inline u8 nvmet_cc_iocqes(u32 cc) /* Convert a 32-bit number to a 16-bit 0's based number */ static inline __le16 to0based(u32 a) { - return cpu_to_le16(max(1U, min(1U << 16, a)) - 1); + return cpu_to_le16(clamp(a, 1U, 1U << 16) - 1); } static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns) @@ -851,6 +859,22 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) bio_put(bio); } +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) +{ + return sq->tls_key ? 
key_serial(sq->tls_key) : 0; +} +static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) +{ + if (sq->tls_key) { + key_put(sq->tls_key); + sq->tls_key = NULL; + } +} +#else +static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) { return 0; } +static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) {} +#endif #ifdef CONFIG_NVME_TARGET_AUTH u32 nvmet_auth_send_data_len(struct nvmet_req *req); void nvmet_execute_auth_send(struct nvmet_req *req); @@ -859,7 +883,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req); int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl); +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq); void nvmet_auth_sq_init(struct nvmet_sq *sq); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); @@ -869,16 +893,18 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); -static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { - return ctrl->host_key != NULL; + return ctrl->host_key != NULL && !nvmet_queue_tls_keyid(sq); } int nvmet_auth_ctrl_exponential(struct nvmet_req *req, u8 *buf, int buf_size); int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, u8 *buf, int buf_size); +void nvmet_auth_insert_psk(struct nvmet_sq *sq); #else -static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) +static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, + struct nvmet_sq *sq) { return 0; } @@ -891,11 +917,13 @@ static inline bool nvmet_check_auth_status(struct nvmet_req *req) { return true; } -static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, + struct nvmet_sq *sq) { return false; } static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; } +static inline void nvmet_auth_insert_psk(struct nvmet_sq *sq) {}; #endif int nvmet_pr_init_ns(struct nvmet_ns *ns); diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index b1e31483f157..b54b3fdbe389 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1385,7 +1385,6 @@ static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid) if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; - flush_workqueue(sq->iod_wq); destroy_workqueue(sq->iod_wq); sq->iod_wq = NULL; @@ -2129,8 +2128,15 @@ static int nvmet_pci_epf_configure_bar(struct nvmet_pci_epf *nvme_epf) return -ENODEV; } - if (epc_features->bar[BAR_0].only_64bit) - epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; + /* + * While NVMe PCIe Transport Specification 1.1, section 2.1.10, claims + * that the BAR0 type is Implementation Specific, in NVMe 1.1, the type + * is required to be 64-bit. Thus, for interoperability, always set the + * type to 64-bit. In the rare case that the PCI EPC does not support + * configuring BAR0 as 64-bit, the call to pci_epc_set_bar() will fail, + * and we will return failure back to the user. 
+ */ + epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; /* * Calculate the size of the register bar: NVMe registers first with diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4f9cac8a5abe..f2d0c920269b 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -8,7 +8,6 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/key.h> #include <linux/nvme-tcp.h> #include <linux/nvme-keyring.h> #include <net/sock.h> @@ -1080,10 +1079,11 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, &queue->nvme_sq, &nvmet_tcp_ops))) { - pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n", req->cmd, req->cmd->common.command_id, req->cmd->common.opcode, - le32_to_cpu(req->cmd->common.dptr.sgl.length)); + le32_to_cpu(req->cmd->common.dptr.sgl.length), + le16_to_cpu(req->cqe->status)); nvmet_tcp_handle_req_failure(queue, queue->cmd, req); return 0; @@ -1609,6 +1609,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) /* stop accepting incoming data */ queue->rcv_state = NVMET_TCP_RECV_ERR; + nvmet_sq_put_tls_key(&queue->nvme_sq); nvmet_tcp_uninit_data_in_cmds(queue); nvmet_sq_destroy(&queue->nvme_sq); cancel_work_sync(&queue->io_work); @@ -1794,6 +1795,27 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) return 0; } +static int nvmet_tcp_tls_key_lookup(struct nvmet_tcp_queue *queue, + key_serial_t peerid) +{ + struct key *tls_key = nvme_tls_key_lookup(peerid); + int status = 0; + + if (IS_ERR(tls_key)) { + pr_warn("%s: queue %d failed to lookup key %x\n", + __func__, queue->idx, peerid); + spin_lock_bh(&queue->state_lock); + queue->state = NVMET_TCP_Q_FAILED; + spin_unlock_bh(&queue->state_lock); + status = PTR_ERR(tls_key); + } else { + pr_debug("%s: queue %d using TLS PSK %x\n", + __func__, queue->idx, peerid); + queue->nvme_sq.tls_key = tls_key; + } + return status; +} + static void nvmet_tcp_tls_handshake_done(void *data, int status, key_serial_t peerid) { @@ -1814,6 +1836,10 @@ static void nvmet_tcp_tls_handshake_done(void *data, int status, spin_unlock_bh(&queue->state_lock); cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); + + if (!status) + status = nvmet_tcp_tls_key_lookup(queue, peerid); + if (status) nvmet_tcp_schedule_release_queue(queue); else diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index f1cfe0bb89b2..0d29470e86b0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1149,7 +1149,7 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) * Next, walk the list, and fill in the addresses and sizes of * each segment. 
*/ - count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + count = __blk_rq_map_sg(rq, cmd->sdb.table.sgl, &last_sg); if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) { unsigned int pad_len = diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index c8dc92a7d63e..73564efd11d2 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -167,18 +167,6 @@ static int iblock_configure_device(struct se_device *dev) break; } - if (dev->dev_attrib.pi_prot_type) { - struct bio_set *bs = &ib_dev->ibd_bio_set; - - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); - } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; diff --git a/drivers/ufs/core/ufshcd-crypto.c b/drivers/ufs/core/ufshcd-crypto.c index 694ff7578fc1..9e63a9d3cb7e 100644 --- a/drivers/ufs/core/ufshcd-crypto.c +++ b/drivers/ufs/core/ufshcd-crypto.c @@ -72,11 +72,11 @@ static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile, if (ccap_array[cap_idx].algorithm_id == UFS_CRYPTO_ALG_AES_XTS) { /* In XTS mode, the blk_crypto_key's size is already doubled */ - memcpy(cfg.crypto_key, key->raw, key->size/2); + memcpy(cfg.crypto_key, key->bytes, key->size/2); memcpy(cfg.crypto_key + UFS_CRYPTO_KEY_MAX_SIZE/2, - key->raw + key->size/2, key->size/2); + key->bytes + key->size/2, key->size/2); } else { - memcpy(cfg.crypto_key, key->raw, key->size); + memcpy(cfg.crypto_key, key->bytes, key->size); } ufshcd_program_key(hba, &cfg, slot); @@ -185,6 +185,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) hba->crypto_profile.ll_ops = ufshcd_crypto_ops; /* UFS only supports 8 bytes for any DUN */ hba->crypto_profile.max_dun_bytes_supported = 8; + hba->crypto_profile.key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; hba->crypto_profile.dev = hba->dev; /* diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c index 13dd5dfc03eb..6a415d9bdc85 100644 --- a/drivers/ufs/host/ufs-exynos.c +++ b/drivers/ufs/host/ufs-exynos.c @@ -1320,6 +1320,7 @@ static void exynos_ufs_fmp_init(struct ufs_hba *hba, struct exynos_ufs *ufs) return; } profile->max_dun_bytes_supported = AES_BLOCK_SIZE; + profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; profile->dev = hba->dev; profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = DATA_UNIT_SIZE; @@ -1366,7 +1367,7 @@ static int exynos_ufs_fmp_fill_prdt(struct ufs_hba *hba, void *prdt, unsigned int num_segments) { struct fmp_sg_entry *fmp_prdt = prdt; - const u8 *enckey = crypt_ctx->bc_key->raw; + const u8 *enckey = crypt_ctx->bc_key->bytes; const u8 *twkey = enckey + AES_KEYSIZE_256; u64 dun_lo = crypt_ctx->bc_dun[0]; u64 dun_hi = crypt_ctx->bc_dun[1]; diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 2cfa1774944b..cc9578e1f38e 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -147,6 +147,7 @@ static int ufs_qcom_ice_init(struct ufs_qcom_host *host) profile->ll_ops = ufs_qcom_crypto_ops; profile->max_dun_bytes_supported = 8; + profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; profile->dev = dev; /* @@ -202,7 +203,7 @@ static int ufs_qcom_ice_keyslot_program(struct blk_crypto_profile *profile, err = qcom_ice_program_key(host->ice, QCOM_ICE_CRYPTO_ALG_AES_XTS, QCOM_ICE_CRYPTO_KEY_SIZE_256, - key->raw, + key->bytes, 
key->crypto_cfg.data_unit_size / 512, slot); ufshcd_release(hba); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 332d828fe6fa..b5dfb0aa405a 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -3,6 +3,7 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_HASH + select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 select KEYS diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 5a384dad2c72..855a0f4b7318 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC @@ -11,6 +7,7 @@ #include <crypto/hash.h> #include <crypto/sha2.h> +#include <crypto/hkdf.h> #include "fscrypt_private.h" @@ -44,20 +41,6 @@ * there's no way to persist a random salt per master key from kernel mode. */ -/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ -static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) -{ - static const u8 default_salt[HKDF_HASHLEN]; - int err; - - err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); - if (err) - return err; - - return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); -} - /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. @@ -69,6 +52,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; + static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; int err; @@ -84,7 +68,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, goto err_free_tfm; } - err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + err = hkdf_extract(hmac_tfm, master_key, master_key_size, + default_salt, HKDF_HASHLEN, prk); if (err) goto err_free_tfm; @@ -118,61 +103,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 prefix[9]; - unsigned int i; + u8 *full_info; int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HKDF_HASHLEN]; - - if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) - return -EINVAL; + full_info = kzalloc(infolen + 9, GFP_KERNEL); + if (!full_info) + return -ENOMEM; desc->tfm = hkdf->hmac_tfm; - memcpy(prefix, "fscrypt\0", 8); - prefix[8] = context; - - for (i = 0; i < okmlen; i += HKDF_HASHLEN) { - - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, HKDF_HASHLEN); - if (err) - goto out; - } - - err = crypto_shash_update(desc, prefix, sizeof(prefix)); - if (err) - goto out; - - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - - BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < HKDF_HASHLEN) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ 
- shash_desc_zero(desc); + memcpy(full_info, "fscrypt\0", 8); + full_info[8] = context; + memcpy(full_info + 9, info, infolen); + + err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, + okm, okmlen); + kfree_sensitive(full_info); return err; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 40de69860dcf..7fa53d30aec3 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -130,6 +130,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); + crypto_cfg.key_type = BLK_CRYPTO_KEY_TYPE_RAW; devs = fscrypt_get_devices(sb, &num_devs); if (IS_ERR(devs)) @@ -166,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, if (!blk_key) return -ENOMEM; - err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, + err = blk_crypto_init_key(blk_key, raw_key, ci->ci_mode->keysize, + BLK_CRYPTO_KEY_TYPE_RAW, crypto_mode, fscrypt_get_dun_bytes(ci), 1U << ci->ci_data_unit_bits); if (err) { diff --git a/include/crypto/hkdf.h b/include/crypto/hkdf.h new file mode 100644 index 000000000000..6a9678f508f5 --- /dev/null +++ b/include/crypto/hkdf.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HKDF: HMAC-based Key Derivation Function (HKDF), RFC 5869 + * + * Extracted from fs/crypto/hkdf.c, which has + * Copyright 2019 Google LLC + */ + +#ifndef _CRYPTO_HKDF_H +#define _CRYPTO_HKDF_H + +#include <crypto/hash.h> + +int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, const u8 *salt, unsigned int saltlen, + u8 *prk); +int hkdf_expand(struct crypto_shash *hmac_tfm, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); +#endif diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 670f2dae692f..996493917f36 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -48,11 +48,11 @@ struct badblocks_context { int ack; }; -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors); +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged); +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors); void ack_all_badblocks(struct badblocks *bb); ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 802f52e38efd..0a25716820fe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -16,8 +16,6 @@ enum bip_flags { }; struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ @@ -25,12 +23,7 @@ struct bio_integrity_payload { unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; #define 
BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ @@ -74,6 +67,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, bip->bip_iter.bi_sector = seed; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, @@ -85,9 +80,6 @@ bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); -int bioset_integrity_create(struct bio_set *bs, int pool_size); -void bioset_integrity_free(struct bio_set *bs); -void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ @@ -96,15 +88,6 @@ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) return NULL; } -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free(struct bio_set *bs) -{ -} - static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; @@ -139,10 +122,6 @@ static inline void bio_integrity_trim(struct bio *bio) { } -static inline void bio_integrity_init(void) -{ -} - static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b79bf50f4f0..cafc7c215de8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -625,10 +625,6 @@ struct bio_set { mempool_t bio_pool; mempool_t bvec_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t bio_integrity_pool; - mempool_t bvec_integrity_pool; -#endif unsigned int back_pad; /* diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 90ab33cb5d0e..4f39e9cd7576 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -57,6 +57,62 @@ struct blk_crypto_ll_ops { int (*keyslot_evict)(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot); + + /** + * @derive_sw_secret: Derive the software secret from a hardware-wrapped + * key in ephemerally-wrapped form. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * Must return 0 on success, -EBADMSG if the key is invalid, or another + * -errno code on other errors. + */ + int (*derive_sw_secret)(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + + /** + * @import_key: Create a hardware-wrapped key by importing a raw key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*import_key)(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @generate_key: Generate a hardware-wrapped key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. 
+ */ + int (*generate_key)(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @prepare_key: Prepare a hardware-wrapped key to be used. + * + * Prepare a hardware-wrapped key to be used by converting it from + * long-term wrapped form to ephemerally-wrapped form. This only needs + * to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED is supported. + * + * On success, must write the key in ephemerally-wrapped form to + * @eph_key and return its size in bytes. On failure, must return + * -EBADMSG if the key is invalid, or another -errno on other error. + */ + int (*prepare_key)(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); }; /** @@ -85,6 +141,12 @@ struct blk_crypto_profile { unsigned int max_dun_bytes_supported; /** + * @key_types_supported: A bitmask of the supported key types: + * BLK_CRYPTO_KEY_TYPE_RAW and/or BLK_CRYPTO_KEY_TYPE_HW_WRAPPED. + */ + unsigned int key_types_supported; + + /** * @modes_supported: Array of bitmasks that specifies whether each * combination of crypto mode and data unit size is supported. * Specifically, the i'th bit of modes_supported[crypto_mode] is set if @@ -143,6 +205,17 @@ void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, const struct blk_crypto_profile *child); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 5e5822c18ee4..58b0c5254a67 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -6,7 +6,9 @@ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H +#include <linux/minmax.h> #include <linux/types.h> +#include <uapi/linux/blk-crypto.h> enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, @@ -17,7 +19,55 @@ enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_MAX, }; -#define BLK_CRYPTO_MAX_KEY_SIZE 64 +/* + * Supported types of keys. Must be bitflags due to their use in + * blk_crypto_profile::key_types_supported. + */ +enum blk_crypto_key_type { + /* + * Raw keys (i.e. "software keys"). These keys are simply kept in raw, + * plaintext form in kernel memory. + */ + BLK_CRYPTO_KEY_TYPE_RAW = 0x1, + + /* + * Hardware-wrapped keys. These keys are only present in kernel memory + * in ephemerally-wrapped form, and they can only be unwrapped by + * dedicated hardware. For details, see the "Hardware-wrapped keys" + * section of Documentation/block/inline-encryption.rst. + */ + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED = 0x2, +}; + +/* + * Currently the maximum raw key size is 64 bytes, as that is the key size of + * BLK_ENCRYPTION_MODE_AES_256_XTS which takes the longest key. + * + * The maximum hardware-wrapped key size depends on the hardware's key wrapping + * algorithm, which is a hardware implementation detail, so it isn't precisely + * specified. But currently 128 bytes is plenty in practice. 
Implementations + * are recommended to wrap a 32-byte key for the hardware KDF with AES-256-GCM, + * which should result in a size closer to 64 bytes than 128. + * + * Both of these values can trivially be increased if ever needed. + */ +#define BLK_CRYPTO_MAX_RAW_KEY_SIZE 64 +#define BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE 128 + +#define BLK_CRYPTO_MAX_ANY_KEY_SIZE \ + MAX(BLK_CRYPTO_MAX_RAW_KEY_SIZE, BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + +/* + * Size of the "software secret" which can be derived from a hardware-wrapped + * key. This is currently always 32 bytes. Note, the choice of 32 bytes + * assumes that the software secret is only used directly for algorithms that + * don't require more than a 256-bit key to get the desired security strength. + * If it were to be used e.g. directly as an AES-256-XTS key, then this would + * need to be increased (which is possible if hardware supports it, but care + * would need to be taken to avoid breaking users who need exactly 32 bytes). + */ +#define BLK_CRYPTO_SW_SECRET_SIZE 32 + /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for @@ -26,20 +76,23 @@ enum blk_crypto_mode_num { * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key + * @key_type: the type of this key -- either raw or hardware-wrapped */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; + enum blk_crypto_key_type key_type; }; /** * struct blk_crypto_key - an inline encryption key - * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this - * key + * @crypto_cfg: the crypto mode, data unit size, key type, and other + * characteristics of this key and how it will be used * @data_unit_size_bits: log2 of data_unit_size - * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) - * @raw: the raw bytes of this key. Only the first @size bytes are used. + * @size: size of this key in bytes. The size of a raw key is fixed for a given + * crypto mode, but the size of a hardware-wrapped key can vary. + * @bytes: the bytes of this key. Only the first @size bytes are significant. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. 
It must not be freed until all bios using it have completed @@ -49,7 +102,7 @@ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; - u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; + u8 bytes[BLK_CRYPTO_MAX_ANY_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 @@ -87,7 +140,9 @@ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); @@ -103,6 +158,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index aba9c24486aa..8eb9b3310167 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1173,14 +1173,13 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; - return __blk_rq_map_sg(q, rq, sglist, &last_sg); + return __blk_rq_map_sg(rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1c0cf6af392c..e39c45bc0a97 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -568,7 +568,22 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; + /* + * Protects against I/O scheduler switching, particularly when updating + * q->elevator. Since the elevator update code path may also modify q-> + * nr_requests and wbt latency, this lock also protects the sysfs attrs + * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update + * may modify hctx tags, reserved-tags and cpumask, so this lock also + * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking + * order during an elevator or nr_hw_queue update, first freeze the + * queue, then acquire ->elevator_lock. + */ + struct mutex elevator_lock; + struct mutex sysfs_lock; + /* + * Protects queue limits and also sysfs attribute read_ahead_kb. 
+ */ struct mutex limits_lock; /* diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index c1d0bc5d9624..60e069a6757f 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -40,5 +40,12 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, + u8 **ret_psk, size_t *ret_len); +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest); +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk); #endif /* _NVME_AUTH_H */ diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h index 19d2b256180f..ab8971afa973 100644 --- a/include/linux/nvme-keyring.h +++ b/include/linux/nvme-keyring.h @@ -6,15 +6,25 @@ #ifndef _NVME_KEYRING_H #define _NVME_KEYRING_H +#include <linux/key.h> + #if IS_ENABLED(CONFIG_NVME_KEYRING) +struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, const char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest); key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn); key_serial_t nvme_keyring_id(void); struct key *nvme_tls_key_lookup(key_serial_t key_id); #else - +static inline struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest) +{ + return ERR_PTR(-ENOTSUPP); +} static inline key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn) { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2dc05b1c3283..2479ed10f53e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1772,6 +1772,13 @@ enum { NVME_AUTH_DHGROUP_INVALID = 0xff, }; +enum { + NVME_AUTH_SECP_NOSC = 0x00, + NVME_AUTH_SECP_SC = 0x01, + NVME_AUTH_SECP_NEWTLSPSK = 0x02, + NVME_AUTH_SECP_REPLACETLSPSK = 0x03, +}; + union nvmf_auth_protocol { struct nvmf_auth_dhchap_protocol_descriptor dhchap; }; diff --git a/include/linux/wait.h b/include/linux/wait.h index 3503fe822e38..965a19809c7e 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1210,14 +1210,16 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) -#define init_wait(wait) \ +#define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ - (wait)->func = autoremove_wake_function; \ + (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) +#define init_wait(wait) init_wait_func(wait, autoremove_wake_function) + typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); diff --git a/include/uapi/linux/blk-crypto.h b/include/uapi/linux/blk-crypto.h new file mode 100644 index 000000000000..97302c6eb6af --- /dev/null +++ b/include/uapi/linux/blk-crypto.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_BLK_CRYPTO_H +#define _UAPI_LINUX_BLK_CRYPTO_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +struct blk_crypto_import_key_arg { + /* Raw key (input) */ + __u64 raw_key_ptr; + __u64 raw_key_size; + /* Long-term wrapped key blob (output) */ + __u64 
lt_key_ptr; + __u64 lt_key_size; + __u64 reserved[4]; +}; + +struct blk_crypto_generate_key_arg { + /* Long-term wrapped key blob (output) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + __u64 reserved[4]; +}; + +struct blk_crypto_prepare_key_arg { + /* Long-term wrapped key blob (input) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + /* Ephemerally-wrapped key blob (output) */ + __u64 eph_key_ptr; + __u64 eph_key_size; + __u64 reserved[4]; +}; + +/* + * These ioctls share the block device ioctl space; see uapi/linux/fs.h. + * 140-141 are reserved for future blk-crypto ioctls; any more than that would + * require an additional allocation from the block device ioctl space. + */ +#define BLKCRYPTOIMPORTKEY _IOWR(0x12, 137, struct blk_crypto_import_key_arg) +#define BLKCRYPTOGENERATEKEY _IOWR(0x12, 138, struct blk_crypto_generate_key_arg) +#define BLKCRYPTOPREPAREKEY _IOWR(0x12, 139, struct blk_crypto_prepare_key_arg) + +#endif /* _UAPI_LINUX_BLK_CRYPTO_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 2bbe00cf1248..e762e1af650c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -212,10 +212,8 @@ struct fsxattr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) #define BLKGETDISKSEQ _IOR(0x12,128,__u64) -/* - * A jump here: 130-136 are reserved for zoned block devices - * (see uapi/linux/blkzoned.h) - */ +/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ +/* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 74246c926b55..7255b36b5cf6 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -405,6 +405,11 @@ struct ublk_param_zoned { __u8 reserved[20]; }; +struct ublk_param_dma_align { + __u32 alignment; + __u8 pad[4]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -417,12 +422,14 @@ struct ublk_params { #define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_ZONED (1 << 3) +#define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; struct ublk_param_devt devt; struct ublk_param_zoned zoned; + struct ublk_param_dma_align dma; }; #endif |
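
The reworked __blk_rq_map_sg()/blk_rq_map_sg() prototypes in include/linux/blk-mq.h above drop the request_queue argument, and the nvme-loop and SCSI callers are updated accordingly. A minimal sketch of the same caller-side change in a block driver's mapping path; my_iod and my_map_data() are hypothetical names used only for illustration.

#include <linux/blk-mq.h>
#include <linux/scatterlist.h>

struct my_iod {
        struct sg_table sgt;
        int nents;
};

static blk_status_t my_map_data(struct request *rq, struct my_iod *iod)
{
        if (sg_alloc_table(&iod->sgt, blk_rq_nr_phys_segments(rq),
                           GFP_ATOMIC))
                return BLK_STS_RESOURCE;

        /* was: blk_rq_map_sg(rq->q, rq, iod->sgt.sgl) */
        iod->nents = blk_rq_map_sg(rq, iod->sgt.sgl);
        if (!iod->nents) {
                sg_free_table(&iod->sgt);
                return BLK_STS_IOERR;
        }
        return BLK_STS_OK;
}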
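
The fs/crypto/hkdf.c hunks above replace the open-coded HKDF with the shared helpers declared in the new include/crypto/hkdf.h. A minimal sketch of driving those helpers directly with HMAC-SHA256; the info label is arbitrary, and keying the transform with the extracted PRK before hkdf_expand() is an assumption based on how the fscrypt code is structured, not something shown in this hunk.

#include <crypto/hash.h>
#include <crypto/hkdf.h>
#include <crypto/sha2.h>
#include <linux/err.h>
#include <linux/string.h>

static int my_derive_subkey(const u8 *ikm, unsigned int ikmlen,
                            u8 *okm, unsigned int okmlen)
{
        static const u8 salt[SHA256_DIGEST_SIZE];       /* unsalted: zeroes */
        static const u8 info[] = "example info";        /* arbitrary label */
        struct crypto_shash *tfm;
        u8 prk[SHA256_DIGEST_SIZE];
        int err;

        tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* HKDF-Extract: input keying material + salt -> pseudorandom key */
        err = hkdf_extract(tfm, ikm, ikmlen, salt, sizeof(salt), prk);
        if (err)
                goto out;

        /* key the transform with the PRK before expanding (assumption) */
        err = crypto_shash_setkey(tfm, prk, sizeof(prk));
        if (err)
                goto out;

        /* HKDF-Expand: PRK + info -> okmlen bytes of output keying material */
        err = hkdf_expand(tfm, info, sizeof(info) - 1, okm, okmlen);
out:
        memzero_explicit(prk, sizeof(prk));
        crypto_free_shash(tfm);
        return err;
}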
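
The badblocks prototypes above change the sector counts to sector_t and make badblocks_set()/badblocks_clear() return bool. A short sketch of a caller adjusted for that, assuming a true return means the range was recorded; my_record_bad_range() and the error code chosen are illustrative only.

#include <linux/badblocks.h>
#include <linux/errno.h>

static int my_record_bad_range(struct badblocks *bb, sector_t start,
                               sector_t nr_sectors)
{
        /* previously an int return where 0 meant success */
        if (!badblocks_set(bb, start, nr_sectors, 1 /* acknowledged */))
                return -ENOSPC; /* table full or range rejected (assumed) */
        return 0;
}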
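
The new blk_crypto_ll_ops members and the key_types_supported field above describe how a driver with key-wrapping hardware advertises hardware-wrapped key support. A compressed, hypothetical sketch of the provider side: my_hw_unwrap() stands in for a vendor firmware call, only two of the new callbacks are shown, and the existing keyslot callbacks are omitted for brevity.

#include <linux/blk-crypto-profile.h>
#include <linux/errno.h>

/* stand-in for a call into vendor key-wrapping firmware (hypothetical) */
static int my_hw_unwrap(const u8 *in, size_t in_size, u8 *out, size_t *out_size)
{
        return -EOPNOTSUPP;
}

static int my_derive_sw_secret(struct blk_crypto_profile *profile,
                               const u8 *eph_key, size_t eph_key_size,
                               u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
{
        size_t len = BLK_CRYPTO_SW_SECRET_SIZE;

        return my_hw_unwrap(eph_key, eph_key_size, sw_secret, &len);
}

static int my_prepare_key(struct blk_crypto_profile *profile,
                          const u8 *lt_key, size_t lt_key_size,
                          u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
{
        size_t len = BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE;
        int err = my_hw_unwrap(lt_key, lt_key_size, eph_key, &len);

        return err ? err : (int)len;    /* size on success, -errno on failure */
}

static const struct blk_crypto_ll_ops my_crypto_ops = {
        /* .keyslot_program / .keyslot_evict omitted for brevity */
        .derive_sw_secret = my_derive_sw_secret,
        .prepare_key      = my_prepare_key,
};

static void my_init_crypto_profile(struct blk_crypto_profile *profile)
{
        profile->ll_ops = my_crypto_ops;
        profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW |
                                       BLK_CRYPTO_KEY_TYPE_HW_WRAPPED;
}

On the consumer side, an upper layer holding an ephemerally-wrapped key would then call the new blk_crypto_derive_sw_secret() to obtain the 32-byte software secret.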
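
blk_crypto_init_key() above now takes the key size and key type explicitly, and struct blk_crypto_key stores its material in a bytes[] array sized for either key type. A short sketch of initialising a raw AES-256-XTS key under the new prototype; the 8-byte DUN and 4096-byte data unit size are arbitrary example values.

#include <linux/blk-crypto.h>

static int my_setup_raw_xts_key(struct blk_crypto_key *bck,
                                const u8 raw_key[64])
{
        /* was: blk_crypto_init_key(bck, raw_key, crypto_mode, dun, dus) */
        return blk_crypto_init_key(bck, raw_key, 64,
                                   BLK_CRYPTO_KEY_TYPE_RAW,
                                   BLK_ENCRYPTION_MODE_AES_256_XTS,
                                   8,           /* dun_bytes */
                                   4096);       /* data_unit_size */
}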
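
The new init_wait_func() helper in include/linux/wait.h generalises init_wait() to take the wake function as an argument. A small sketch of an on-stack waiter that uses a custom wake callback; my_wake() and the surrounding wait loop are illustrative.

#include <linux/sched.h>
#include <linux/wait.h>

static int my_wake(struct wait_queue_entry *wq_entry, unsigned int mode,
                   int flags, void *key)
{
        /* driver-specific bookkeeping could go here */
        return autoremove_wake_function(wq_entry, mode, flags, key);
}

static void my_wait_for_done(struct wait_queue_head *wq, bool *done)
{
        struct wait_queue_entry wait;

        init_wait_func(&wait, my_wake); /* init_wait() would use the default */
        for (;;) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (READ_ONCE(*done))
                        break;
                schedule();
        }
        finish_wait(wq, &wait);
}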
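
The new uapi header above defines the hardware-wrapped key ioctls. A hypothetical userspace sketch that imports a raw key into a long-term wrapped blob with BLKCRYPTOIMPORTKEY; the device path and key material are placeholders, the call presumably needs appropriate privileges, and it is assumed that the kernel writes the resulting blob size back into lt_key_size.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blk-crypto.h>

int main(void)
{
        uint8_t raw_key[32];
        uint8_t lt_key[128];            /* BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE */
        struct blk_crypto_import_key_arg arg = {
                .raw_key_ptr  = (uintptr_t)raw_key,
                .raw_key_size = sizeof(raw_key),
                .lt_key_ptr   = (uintptr_t)lt_key,
                .lt_key_size  = sizeof(lt_key),
        };
        int fd;

        memset(raw_key, 0x41, sizeof(raw_key)); /* placeholder key material */

        fd = open("/dev/sda", O_RDONLY);        /* hypothetical device */
        if (fd < 0)
                return 1;
        if (ioctl(fd, BLKCRYPTOIMPORTKEY, &arg) < 0) {
                perror("BLKCRYPTOIMPORTKEY");
                close(fd);
                return 1;
        }
        /* assumed: the kernel updates lt_key_size to the blob's actual size */
        printf("imported long-term key blob of %llu bytes\n",
               (unsigned long long)arg.lt_key_size);
        close(fd);
        return 0;
}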
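
The new UBLK_PARAM_TYPE_DMA_ALIGN parameter lets a ublk server advertise a buffer-alignment constraint. A tiny sketch of filling it in before issuing UBLK_CMD_SET_PARAMS; the value 511 assumes the field maps onto the block layer's mask-style dma_alignment limit, which is not spelled out in this hunk.

#include <linux/ublk_cmd.h>

static void my_add_dma_align(struct ublk_params *p)
{
        p->types |= UBLK_PARAM_TYPE_DMA_ALIGN;
        p->dma.alignment = 511;         /* i.e. buffers 512-byte aligned */
}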