1640 files changed, 35755 insertions, 24549 deletions
@@ -33,6 +33,8 @@ Al Viro <viro@zenIV.linux.org.uk> Andi Kleen <ak@linux.intel.com> <ak@suse.de> Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com> Andreas Herrmann <aherrman@de.ibm.com> +Andrej Shadura <andrew.shadura@collabora.co.uk> +Andrej Shadura <andrew@shadura.me> <andrew@beldisplaytech.com> Andrew Morton <akpm@linux-foundation.org> Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk> Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com> @@ -971,6 +971,7 @@ D: PowerPC N: Daniel Drake E: dsd@gentoo.org D: USBAT02 CompactFlash support in usb-storage +D: ZD1211RW wireless driver S: UK N: Oleg Drokin diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index d5b0e8aa043a..81d37ac7132c 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1234,7 +1234,7 @@ PAGE_SIZE multiple when read back. Note that all fields in this file are hierarchical and the file modified event can be generated due to an event down the - hierarchy. For for the local events at the cgroup level see + hierarchy. For the local events at the cgroup level see memory.events.local. low @@ -2178,19 +2178,19 @@ existing device files. Cgroup v2 device controller has no interface files and is implemented on top of cgroup BPF. To control access to device files, a user may -create bpf programs of the BPF_CGROUP_DEVICE type and attach them -to cgroups. On an attempt to access a device file, corresponding -BPF programs will be executed, and depending on the return value -the attempt will succeed or fail with -EPERM. - -A BPF_CGROUP_DEVICE program takes a pointer to the bpf_cgroup_dev_ctx -structure, which describes the device access attempt: access type -(mknod/read/write) and device (type, major and minor numbers). -If the program returns 0, the attempt fails with -EPERM, otherwise -it succeeds. - -An example of BPF_CGROUP_DEVICE program may be found in the kernel -source tree in the tools/testing/selftests/bpf/progs/dev_cgroup.c file. +create bpf programs of type BPF_PROG_TYPE_CGROUP_DEVICE and attach +them to cgroups with BPF_CGROUP_DEVICE flag. On an attempt to access a +device file, corresponding BPF programs will be executed, and depending +on the return value the attempt will succeed or fail with -EPERM. + +A BPF_PROG_TYPE_CGROUP_DEVICE program takes a pointer to the +bpf_cgroup_dev_ctx structure, which describes the device access attempt: +access type (mknod/read/write) and device (type, major and minor numbers). +If the program returns 0, the attempt fails with -EPERM, otherwise it +succeeds. + +An example of BPF_PROG_TYPE_CGROUP_DEVICE program may be found in +tools/testing/selftests/bpf/progs/dev_cgroup.c in the kernel source tree. RDMA diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 91ba391f9b32..43dc35fe5bc0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1266,7 +1266,7 @@ The VGA and EFI output is eventually overwritten by the real console. - The xen output can only be used by Xen PV guests. + The xen option can only be used in Xen domains. The sclp output can only be used on s390. diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 7f9b40d6b416..4d151fbe2058 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -1,5 +1,7 @@ .. 
SPDX-License-Identifier: GPL-2.0 +.. _inline_encryption: + ================= Inline Encryption ================= @@ -7,230 +9,269 @@ Inline Encryption Background ========== -Inline encryption hardware sits logically between memory and the disk, and can -en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a -fixed number of "keyslots" - slots into which encryption contexts (i.e. the -encryption key, encryption algorithm, data unit size) can be programmed by the -kernel at any time. Each request sent to the disk can be tagged with the index -of a keyslot (and also a data unit number to act as an encryption tweak), and -the inline encryption hardware will en/decrypt the data in the request with the -encryption context programmed into that keyslot. This is very different from -full disk encryption solutions like self encrypting drives/TCG OPAL/ATA -Security standards, since with inline encryption, any block on disk could be -encrypted with any encryption context the kernel chooses. - +Inline encryption hardware sits logically between memory and disk, and can +en/decrypt data as it goes in/out of the disk. For each I/O request, software +can control exactly how the inline encryption hardware will en/decrypt the data +in terms of key, algorithm, data unit size (the granularity of en/decryption), +and data unit number (a value that determines the initialization vector(s)). + +Some inline encryption hardware accepts all encryption parameters including raw +keys directly in low-level I/O requests. However, most inline encryption +hardware instead has a fixed number of "keyslots" and requires that the key, +algorithm, and data unit size first be programmed into a keyslot. Each +low-level I/O request then just contains a keyslot index and data unit number. + +Note that inline encryption hardware is very different from traditional crypto +accelerators, which are supported through the kernel crypto API. Traditional +crypto accelerators operate on memory regions, whereas inline encryption +hardware operates on I/O requests. Thus, inline encryption hardware needs to be +managed by the block layer, not the kernel crypto API. + +Inline encryption hardware is also very different from "self-encrypting drives", +such as those based on the TCG Opal or ATA Security standards. Self-encrypting +drives don't provide fine-grained control of encryption and provide no way to +verify the correctness of the resulting ciphertext. Inline encryption hardware +provides fine-grained control of encryption, including the choice of key and +initialization vector for each sector, and can be tested for correctness. Objective ========= -We want to support inline encryption (IE) in the kernel. -To allow for testing, we also want a crypto API fallback when actual -IE hardware is absent. We also want IE to work with layered devices -like dm and loopback (i.e. we want to be able to use the IE hardware -of the underlying devices if present, or else fall back to crypto API -en/decryption). - +We want to support inline encryption in the kernel. To make testing easier, we +also want support for falling back to the kernel crypto API when actual inline +encryption hardware is absent. We also want inline encryption to work with +layered devices like device-mapper and loopback (i.e. we want to be able to use +the inline encryption hardware of the underlying devices if present, or else +fall back to crypto API en/decryption). 
Constraints and notes ===================== -- IE hardware has a limited number of "keyslots" that can be programmed - with an encryption context (key, algorithm, data unit size, etc.) at any time. - One can specify a keyslot in a data request made to the device, and the - device will en/decrypt the data using the encryption context programmed into - that specified keyslot. When possible, we want to make multiple requests with - the same encryption context share the same keyslot. - -- We need a way for upper layers like filesystems to specify an encryption - context to use for en/decrypting a struct bio, and a device driver (like UFS) - needs to be able to use that encryption context when it processes the bio. - -- We need a way for device drivers to expose their inline encryption - capabilities in a unified way to the upper layers. - - -Design -====== - -We add a struct bio_crypt_ctx to struct bio that can -represent an encryption context, because we need to be able to pass this -encryption context from the upper layers (like the fs layer) to the -device driver to act upon. - -While IE hardware works on the notion of keyslots, the FS layer has no -knowledge of keyslots - it simply wants to specify an encryption context to -use while en/decrypting a bio. - -We introduce a keyslot manager (KSM) that handles the translation from -encryption contexts specified by the FS to keyslots on the IE hardware. -This KSM also serves as the way IE hardware can expose its capabilities to -upper layers. The generic mode of operation is: each device driver that wants -to support IE will construct a KSM and set it up in its struct request_queue. -Upper layers that want to use IE on this device can then use this KSM in -the device's struct request_queue to translate an encryption context into -a keyslot. The presence of the KSM in the request queue shall be used to mean -that the device supports IE. - -The KSM uses refcounts to track which keyslots are idle (either they have no -encryption context programmed, or there are no in-flight struct bios -referencing that keyslot). When a new encryption context needs a keyslot, it -tries to find a keyslot that has already been programmed with the same -encryption context, and if there is no such keyslot, it evicts the least -recently used idle keyslot and programs the new encryption context into that -one. If no idle keyslots are available, then the caller will sleep until there -is at least one. - - -blk-mq changes, other block layer changes and blk-crypto-fallback -================================================================= - -We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to -struct request. These will be referred to as the ``crypto fields`` -for the request. This ``keyslot`` is the keyslot into which the -``bi_crypt_context`` has been programmed in the KSM of the ``request_queue`` -that this request is being sent to. - -We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain -blissfully unaware of whether or not real inline encryption hardware is present -underneath. When a bio is submitted with a target ``request_queue`` that doesn't -support the encryption context specified with the bio, the block layer will -en/decrypt the bio with the blk-crypto-fallback. - -If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio -is encrypted stored in the bounce bio - blk-mq will then proceed to process the -bounce bio as if it were not encrypted at all (except when blk-integrity is -concerned). 
``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an -internal function that cleans up the bounce bio and ends the original bio. - -If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``) -is saved and overwritten by ``blk-crypto-fallback`` to -``bio_crypto_fallback_decrypt_bio``. The bio's ``bi_crypt_context`` is also -overwritten with ``NULL``, so that to the rest of the stack, the bio looks -as if it was a regular bio that never had an encryption context specified. -``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original -``bi_end_io`` (and also ``bi_private``) and end the bio again. - -Regardless of whether real inline encryption hardware is used or the +- We need a way for upper layers (e.g. filesystems) to specify an encryption + context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need + to be able to use that encryption context when they process the request. + Encryption contexts also introduce constraints on bio merging; the block layer + needs to be aware of these constraints. + +- Different inline encryption hardware has different supported algorithms, + supported data unit sizes, maximum data unit numbers, etc. We call these + properties the "crypto capabilities". We need a way for device drivers to + advertise crypto capabilities to upper layers in a generic way. + +- Inline encryption hardware usually (but not always) requires that keys be + programmed into keyslots before being used. Since programming keyslots may be + slow and there may not be very many keyslots, we shouldn't just program the + key for every I/O request, but rather keep track of which keys are in the + keyslots and reuse an already-programmed keyslot when possible. + +- Upper layers typically define a specific end-of-life for crypto keys, e.g. + when an encrypted directory is locked or when a crypto mapping is torn down. + At these times, keys are wiped from memory. We must provide a way for upper + layers to also evict keys from any keyslots they are present in. + +- When possible, device-mapper devices must be able to pass through the inline + encryption support of their underlying devices. However, it doesn't make + sense for device-mapper devices to have keyslots themselves. + +Basic design +============ + +We introduce ``struct blk_crypto_key`` to represent an inline encryption key and +how it will be used. This includes the actual bytes of the key; the size of the +key; the algorithm and data unit size the key will be used with; and the number +of bytes needed to represent the maximum data unit number the key will be used +with. + +We introduce ``struct bio_crypt_ctx`` to represent an encryption context. It +contains a data unit number and a pointer to a blk_crypto_key. We add pointers +to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users +of the block layer (e.g. filesystems) to provide an encryption context when +creating a bio and have it be passed down the stack for processing by the block +layer and device drivers. Note that the encryption context doesn't explicitly +say whether to encrypt or decrypt, as that is implicit from the direction of the +bio; WRITE means encrypt, and READ means decrypt. + +We also introduce ``struct blk_crypto_profile`` to contain all generic inline +encryption-related state for a particular inline encryption device. 
The +blk_crypto_profile serves as the way that drivers for inline encryption hardware +advertise their crypto capabilities and provide certain functions (e.g., +functions to program and evict keys) to upper layers. Each device driver that +wants to support inline encryption will construct a blk_crypto_profile, then +associate it with the disk's request_queue. + +The blk_crypto_profile also manages the hardware's keyslots, when applicable. +This happens in the block layer, so that users of the block layer can just +specify encryption contexts and don't need to know about keyslots at all, nor do +device drivers need to care about most details of keyslot management. + +Specifically, for each keyslot, the block layer (via the blk_crypto_profile) +keeps track of which blk_crypto_key that keyslot contains (if any), and how many +in-flight I/O requests are using it. When the block layer creates a +``struct request`` for a bio that has an encryption context, it grabs a keyslot +that already contains the key if possible. Otherwise it waits for an idle +keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the +least-recently-used idle keyslot using the function the device driver provided. +In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of +the request, where it is then accessible to device drivers and is released after +the request completes. + +``struct request`` also contains a pointer to the original bio_crypt_ctx. +Requests can be built from multiple bios, and the block layer must take the +encryption context into account when trying to merge bios and requests. For two +bios/requests to be merged, they must have compatible encryption contexts: both +unencrypted, or both encrypted with the same key and contiguous data unit +numbers. Only the encryption context for the first bio in a request is +retained, since the remaining bios have been verified to be merge-compatible +with the first bio. + +To make it possible for inline encryption to work with request_queue based +layered devices, when a request is cloned, its encryption context is cloned as +well. When the cloned request is submitted, it is then processed as usual; this +includes getting a keyslot from the clone's target device if needed. + +blk-crypto-fallback +=================== + +It is desirable for the inline encryption support of upper layers (e.g. +filesystems) to be testable without real inline encryption hardware, and +likewise for the block layer's keyslot management logic. It is also desirable +to allow upper layers to just always use inline encryption rather than have to +implement encryption in multiple ways. + +Therefore, we also introduce *blk-crypto-fallback*, which is an implementation +of inline encryption using the kernel crypto API. blk-crypto-fallback is built +into the block layer, so it works on any block device without any special setup. +Essentially, when a bio with an encryption context is submitted to a +request_queue that doesn't support that encryption context, the block layer will +handle en/decryption of the bio using blk-crypto-fallback. + +For encryption, the data cannot be encrypted in-place, as callers usually rely +on it being unmodified. Instead, blk-crypto-fallback allocates bounce pages, +fills a new bio with those bounce pages, encrypts the data into those bounce +pages, and submits that "bounce" bio. When the bounce bio completes, +blk-crypto-fallback completes the original bio. 
If the original bio is too +large, multiple bounce bios may be required; see the code for details. + +For decryption, blk-crypto-fallback "wraps" the bio's completion callback +(``bi_end_io``) and private data (``bi_private``) with its own, unsets the +bio's encryption context, then submits the bio. If the read completes +successfully, blk-crypto-fallback restores the bio's original completion +callback and private data, then decrypts the bio's data in-place using the +kernel crypto API. Decryption happens from a workqueue, as it may sleep. +Afterwards, blk-crypto-fallback completes the bio. + +In both cases, the bios that blk-crypto-fallback submits no longer have an +encryption context. Therefore, lower layers only see standard unencrypted I/O. + +blk-crypto-fallback also defines its own blk_crypto_profile and has its own +"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects. The reason +for this is twofold. First, it allows the keyslot management logic to be tested +without actual inline encryption hardware. Second, similar to actual inline +encryption hardware, the crypto API doesn't accept keys directly in requests but +rather requires that keys be set ahead of time, and setting keys can be +expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path +at all due to the locks it takes. Therefore, the concept of keyslots still +makes sense for blk-crypto-fallback. + +Note that regardless of whether real inline encryption hardware or blk-crypto-fallback is used, the ciphertext written to disk (and hence the -on-disk format of data) will be the same (assuming the hardware's implementation -of the algorithm being used adheres to spec and functions correctly). - -If a ``request queue``'s inline encryption hardware claimed to support the -encryption context specified with a bio, then it will not be handled by the -``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a -struct request needs to be allocated for that bio. At that point, -blk-mq tries to program the encryption context into the ``request_queue``'s -keyslot_manager, and obtain a keyslot, which it stores in its newly added -``keyslot`` field. This keyslot is released when the request is completed. - -When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called, -which sets the request's ``crypt_ctx`` to a copy of the bio's -``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent -bio is merged to the front of the request, which updates the ``crypt_ctx`` of -the request so that it matches the newly merged bio's ``bi_crypt_context``. In particular, the request keeps a copy of the ``bi_crypt_context`` of the first -bio in its bio-list (blk-mq needs to be careful to maintain this invariant -during bio and request merges). - -To make it possible for inline encryption to work with request queue based -layered devices, when a request is cloned, its ``crypto fields`` are cloned as -well. When the cloned request is submitted, blk-mq programs the -``bi_crypt_context`` of the request into the clone's request_queue's keyslot -manager, and stores the returned keyslot in the clone's ``keyslot``. +on-disk format of data) will be the same (assuming that both the inline +encryption hardware's implementation and the kernel crypto API's implementation +of the algorithm being used adhere to spec and function correctly). +blk-crypto-fallback is optional and is controlled by the +``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option.
API presented to users of the block layer ========================================= -``struct blk_crypto_key`` represents a crypto key (the raw key, size of the -key, the crypto algorithm to use, the data unit size to use, and the number of -bytes required to represent data unit numbers that will be specified with the -``bi_crypt_context``). - -``blk_crypto_init_key`` allows upper layers to initialize such a -``blk_crypto_key``. - -``bio_crypt_set_ctx`` should be called on any bio that a user of -the block layer wants en/decrypted via inline encryption (or the -blk-crypto-fallback, if hardware support isn't available for the desired -crypto configuration). This function takes the ``blk_crypto_key`` and the -data unit number (DUN) to use when en/decrypting the bio. - -``blk_crypto_config_supported`` allows upper layers to query whether or not the -an encryption context passed to request queue can be handled by blk-crypto -(either by real inline encryption hardware, or by the blk-crypto-fallback). -This is useful e.g. when blk-crypto-fallback is disabled, and the upper layer -wants to use an algorithm that may not supported by hardware - this function -lets the upper layer know ahead of time that the algorithm isn't supported, -and the upper layer can fallback to something else if appropriate. - -``blk_crypto_start_using_key`` - Upper layers must call this function on -``blk_crypto_key`` and a ``request_queue`` before using the key with any bio -headed for that ``request_queue``. This function ensures that either the -hardware supports the key's crypto settings, or the crypto API fallback has -transforms for the needed mode allocated and ready to go. Note that this -function may allocate an ``skcipher``, and must not be called from the data -path, since allocating ``skciphers`` from the data path can deadlock. - -``blk_crypto_evict_key`` *must* be called by upper layers before a -``blk_crypto_key`` is freed. Further, it *must* only be called only once -there are no more in-flight requests that use that ``blk_crypto_key``. -``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots in -inline encryption hardware that the key might have been programmed into (or the blk-crypto-fallback). +``blk_crypto_config_supported()`` allows users to check ahead of time whether +inline encryption with particular crypto settings will work on a particular +request_queue -- either via hardware or via blk-crypto-fallback. This function +takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits +the actual bytes of the key and instead just contains the algorithm, data unit +size, etc. This function can be useful if blk-crypto-fallback is disabled. + +``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key. + +Users must call ``blk_crypto_start_using_key()`` before actually starting to use +a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()`` +was called earlier). This is needed to initialize blk-crypto-fallback if it +will be needed. This must not be called from the data path, as this may have to +allocate resources, which may deadlock in that case. + +Next, to attach an encryption context to a bio, users should call +``bio_crypt_set_ctx()``. This function allocates a bio_crypt_ctx and attaches +it to a bio, given the blk_crypto_key and the data unit number that will be used +for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx +later, as that happens automatically when the bio is freed or reset. 
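Putting the calls above together, here is a minimal illustrative sketch (not part of the patch) of the sequence a user of the block layer might follow. The prototypes are paraphrased from include/linux/blk-crypto.h and may differ between kernel versions; the key bytes, crypto mode, data unit size, and function name are purely illustrative. The final eviction step is described in the paragraphs that follow::

  /*
   * Sketch only: in real code, key setup (init + start_using) would normally
   * happen once at setup time rather than on the I/O path.
   */
  #include <linux/bio.h>
  #include <linux/blk-crypto.h>
  #include <linux/string.h>

  static int example_encrypted_io(struct request_queue *q, struct bio *bio,
                                  const u8 raw_key[32])
  {
          struct blk_crypto_key key;
          u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { 0 };  /* data unit number */
          int err;

          /* Initialize the key: AES-256-XTS, 8-byte DUNs, 4096-byte data units. */
          err = blk_crypto_init_key(&key, raw_key,
                                    BLK_ENCRYPTION_MODE_AES_256_XTS, 8, 4096);
          if (err)
                  return err;

          /* May allocate fallback resources; not to be done on the data path. */
          err = blk_crypto_start_using_key(&key, q);
          if (err)
                  return err;

          /* Attach the encryption context, then submit the bio as usual. */
          bio_crypt_set_ctx(bio, &key, dun, GFP_NOIO);
          err = submit_bio_wait(bio);

          /* Only after *all* I/O using this key has completed: evict and zeroize. */
          blk_crypto_evict_key(q, &key);
          memzero_explicit(&key, sizeof(key));
          return err;
  }
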
+ +Finally, when done using inline encryption with a blk_crypto_key on a +request_queue, users must call ``blk_crypto_evict_key()``. This ensures that +the key is evicted from all keyslots it may be programmed into and unlinked from +any kernel data structures it may be linked into. + +In summary, for users of the block layer, the lifecycle of a blk_crypto_key is +as follows: + +1. ``blk_crypto_config_supported()`` (optional) +2. ``blk_crypto_init_key()`` +3. ``blk_crypto_start_using_key()`` +4. ``bio_crypt_set_ctx()`` (potentially many times) +5. ``blk_crypto_evict_key()`` (after all I/O has completed) +6. Zeroize the blk_crypto_key (this has no dedicated function) + +If a blk_crypto_key is being used on multiple request_queues, then +``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``, +and ``blk_crypto_evict_key()`` must be called on each request_queue. API presented to device drivers =============================== -A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in -the ``request_queue`` of the device. The device driver needs to call -``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the -``blk_keyslot_manager``, while specifying the number of keyslots supported by -the hardware. - -The device driver also needs to tell the KSM how to actually manipulate the -IE hardware in the device to do things like programming the crypto key into -the IE hardware into a particular keyslot. All this is achieved through the -struct blk_ksm_ll_ops field in the KSM that the device driver -must fill up after initing the ``blk_keyslot_manager``. - -The KSM also handles runtime power management for the device when applicable -(e.g. when it wants to program a crypto key into the IE hardware, the device -must be runtime powered on) - so the device driver must also set the ``dev`` -field in the ksm to point to the `struct device` for the KSM to use for runtime -power management. - -``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device -needs each and every of its keyslots to be reprogrammed with the key it -"should have" at the point in time when the function is called. This is useful -e.g. if a device loses all its keys on runtime power down/up. - -If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then -``blk_ksm_destroy`` should be called to free up all resources used by a -``blk_keyslot_manager`` once it is no longer needed. +A device driver that wants to support inline encryption must set up a +blk_crypto_profile in the request_queue of its device. To do this, it first +must call ``blk_crypto_profile_init()`` (or its resource-managed variant +``devm_blk_crypto_profile_init()``), providing the number of keyslots. + +Next, it must advertise its crypto capabilities by setting fields in the +blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``. + +It then must set function pointers in the ``ll_ops`` field of the +blk_crypto_profile to tell upper layers how to control the inline encryption +hardware, e.g. how to program and evict keyslots. Most drivers will need to +implement ``keyslot_program`` and ``keyslot_evict``. For details, see the +comments for ``struct blk_crypto_ll_ops``. + +Once the driver registers a blk_crypto_profile with a request_queue, I/O +requests the driver receives via that queue may have an encryption context. 
All +encryption contexts will be compatible with the crypto capabilities declared in +the blk_crypto_profile, so drivers don't need to worry about handling +unsupported requests. Also, if a nonzero number of keyslots was declared in the +blk_crypto_profile, then all I/O requests that have an encryption context will +also have a keyslot which was already programmed with the appropriate key. + +If the driver implements runtime suspend and its blk_crypto_ll_ops don't work +while the device is runtime-suspended, then the driver must also set the ``dev`` +field of the blk_crypto_profile to point to the ``struct device`` that will be +resumed before any of the low-level operations are called. + +If there are situations where the inline encryption hardware loses the contents +of its keyslots, e.g. device resets, the driver must handle reprogramming the +keyslots. To do this, the driver may call ``blk_crypto_reprogram_all_keys()``. + +Finally, if the driver used ``blk_crypto_profile_init()`` instead of +``devm_blk_crypto_profile_init()``, then it is responsible for calling +``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed. Layered Devices =============== -Request queue based layered devices like dm-rq that wish to support IE need to -create their own keyslot manager for their request queue, and expose whatever -functionality they choose. When a layered device wants to pass a clone of that -request to another ``request_queue``, blk-crypto will initialize and prepare the -clone as necessary - see ``blk_crypto_insert_cloned_request`` in -``blk-crypto.c``. - - -Future Optimizations for layered devices -======================================== - -Creating a keyslot manager for a layered device uses up memory for each -keyslot, and in general, a layered device merely passes the request on to a -"child" device, so the keyslots in the layered device itself are completely -unused, and don't need any refcounting or keyslot programming. We can instead -define a new type of KSM; the "passthrough KSM", that layered devices can use -to advertise an unlimited number of keyslots, and support for any encryption -algorithms they choose, while not actually using any memory for each keyslot. -Another use case for the "passthrough KSM" is for IE devices that do not have a -limited number of keyslots. - +Request queue based layered devices like dm-rq that wish to support inline +encryption need to create their own blk_crypto_profile for their request_queue, +and expose whatever functionality they choose. When a layered device wants to +pass a clone of that request to another request_queue, blk-crypto will +initialize and prepare the clone as necessary; see +``blk_crypto_insert_cloned_request()``. Interaction between inline encryption and blk integrity ======================================================= @@ -257,7 +298,7 @@ Because there isn't any real hardware yet, it seems prudent to assume that hardware implementations might not implement both features together correctly, and disallow the combination for now. Whenever a device supports integrity, the kernel will pretend that the device does not support hardware inline encryption -(by essentially setting the keyslot manager in the request_queue of the device -to NULL). When the crypto API fallback is enabled, this means that all bios with -and encryption context will use the fallback, and IO will complete as usual. -When the fallback is disabled, a bio with an encryption context will be failed. 
+(by setting the blk_crypto_profile in the request_queue of the device to NULL). +When the crypto API fallback is enabled, this means that all bios with an +encryption context will use the fallback, and IO will complete as usual. When +the fallback is disabled, a bio with an encryption context will be failed. diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 4dc7f0d499a8..e8c74306f70a 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -4,7 +4,7 @@ Queue sysfs files This text file will detail the queue files that are located in the sysfs tree for each block device. Note that stacked devices typically do not export -any settings, since their queue merely functions are a remapping target. +any settings, since their queue merely functions as a remapping target. These files are the ones found in the /sys/block/xxx/queue/ directory. Files denoted with a RO postfix are readonly and the RW postfix means @@ -286,4 +286,35 @@ sequential zones of zoned block devices (devices with a zoned attributed that reports "host-managed" or "host-aware"). This value is always 0 for regular block devices. +independent_access_ranges (RO) +------------------------------ + +The presence of this sub-directory of the /sys/block/xxx/queue/ directory +indicates that the device is capable of executing requests targeting +different sector ranges in parallel. For instance, single LUN multi-actuator +hard-disks will have an independent_access_ranges directory if the device +correctly advertizes the sector ranges of its actuators. + +The independent_access_ranges directory contains one directory per access +range, with each range described using the sector (RO) attribute file to +indicate the first sector of the range and the nr_sectors (RO) attribute file +to indicate the total number of sectors in the range starting from the first +sector of the range. For example, a dual-actuator hard-disk will have the +following independent_access_ranges entries.:: + + $ tree /sys/block/<device>/queue/independent_access_ranges/ + /sys/block/<device>/queue/independent_access_ranges/ + |-- 0 + | |-- nr_sectors + | `-- sector + `-- 1 + |-- nr_sectors + `-- sector + +The sector and nr_sectors attributes use 512B sector unit, regardless of +the actual block size of the device. Independent access ranges do not +overlap and include all sectors within the device capacity. The access +ranges are numbered in increasing order of the range start sector, +that is, the sector attribute of range 0 always has the value 0. + Jens Axboe <jens.axboe@oracle.com>, February 2009 diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst index 5845960ca382..52ea7b6b2fe8 100644 --- a/Documentation/cdrom/cdrom-standard.rst +++ b/Documentation/cdrom/cdrom-standard.rst @@ -907,6 +907,17 @@ commands can be identified by the underscores in their names. specifies the slot for which the information is given. The special value *CDSL_CURRENT* requests that information about the currently selected slot be returned. +`CDROM_TIMED_MEDIA_CHANGE` + Checks whether the disc has been changed since a user supplied time + and returns the time of the last disc change. + + *arg* is a pointer to a *cdrom_timed_media_change_info* struct. + *arg->last_media_change* may be set by calling code to signal + the timestamp of the last known media change (by the caller).
+ Upon successful return, this ioctl call will set + *arg->last_media_change* to the latest media change timestamp (in ms) + known by the kernel/driver and set *arg->has_changed* to 1 if + that timestamp is more recent than the timestamp set by the caller. `CDROM_DRIVE_STATUS` Returns the status of the drive by a call to *drive_status()*. Return values are defined in cdrom_drive_status_. diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 8aed9103e48a..5c0552e78c58 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -326,6 +326,12 @@ maps this page at its virtual address. dirty. Again, see sparc64 for examples of how to deal with this. + ``void flush_dcache_folio(struct folio *folio)`` + This function is called under the same circumstances as + flush_dcache_page(). It allows the architecture to + optimise for flushing the entire folio of pages instead + of flushing one page at a time. + ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len)`` ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page, diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst index 9c0e8758037a..d30b4d0a9769 100644 --- a/Documentation/core-api/irq/irq-domain.rst +++ b/Documentation/core-api/irq/irq-domain.rst @@ -67,9 +67,6 @@ variety of methods: deprecated - generic_handle_domain_irq() handles an interrupt described by a domain and a hwirq number -- handle_domain_irq() does the same thing for root interrupt - controllers and deals with the set_irq_reg()/irq_enter() sequences - that most architecture requires Note that irq domain lookups must happen in contexts that are compatible with a RCU read-side critical section. diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index a42f9baddfbf..395835f9289f 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -95,6 +95,11 @@ More Memory Management Functions .. kernel-doc:: mm/mempolicy.c .. kernel-doc:: include/linux/mm_types.h :internal: +.. kernel-doc:: include/linux/mm_inline.h +.. kernel-doc:: include/linux/page-flags.h .. kernel-doc:: include/linux/mm.h :internal: +.. kernel-doc:: include/linux/page_ref.h .. kernel-doc:: include/linux/mmzone.h +.. kernel-doc:: mm/util.c + :functions: folio_mapping diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml index 07b20383cbca..b446d0f0f1b4 100644 --- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml @@ -50,7 +50,6 @@ properties: data-lanes: description: array of physical DSI data lane indexes. minItems: 1 - maxItems: 4 items: - const: 1 - const: 2 @@ -71,7 +70,6 @@ properties: data-lanes: description: array of physical DSI data lane indexes. 
minItems: 1 - maxItems: 4 items: - const: 1 - const: 2 diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml index 1c2daf7c24cc..911564468c5e 100644 --- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml @@ -18,7 +18,7 @@ properties: const: ti,sn65dsi86 reg: - const: 0x2d + enum: [ 0x2c, 0x2d ] enable-gpios: maxItems: 1 diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml index 2ed010f91e2d..20ce88ab4b3a 100644 --- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml +++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml @@ -22,7 +22,7 @@ properties: items: - enum: # ili9341 240*320 Color on stm32f429-disco board - - st,sf-tc240t-9370-t + - st,sf-tc240t-9370-t - const: ilitek,ili9341 reg: true diff --git a/Documentation/devicetree/bindings/interrupt-controller/microchip,eic.yaml b/Documentation/devicetree/bindings/interrupt-controller/microchip,eic.yaml new file mode 100644 index 000000000000..50003880ee6f --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/microchip,eic.yaml @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interrupt-controller/microchip,eic.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip External Interrupt Controller + +maintainers: + - Claudiu Beznea <claudiu.beznea@microchip.com> + +description: + This interrupt controller is found in Microchip SoCs (SAMA7G5) and provides + support for handling up to 2 external interrupt lines. + +properties: + compatible: + enum: + - microchip,sama7g5-eic + + reg: + maxItems: 1 + + interrupt-controller: true + + '#interrupt-cells': + const: 2 + description: + The first cell is the input IRQ number (between 0 and 1), the second cell + is the trigger type as defined in interrupt.txt present in this directory. + + interrupts: + description: | + Contains the GIC SPI IRQs mapped to the external interrupt lines. They + should be specified sequentially from output 0 to output 1. + minItems: 2 + maxItems: 2 + + clocks: + maxItems: 1 + + clock-names: + const: pclk + +required: + - compatible + - reg + - interrupt-controller + - '#interrupt-cells' + - interrupts + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include <dt-bindings/clock/at91.h> + #include <dt-bindings/interrupt-controller/arm-gic.h> + + eic: interrupt-controller@e1628000 { + compatible = "microchip,sama7g5-eic"; + reg = <0xe1628000 0x100>; + interrupt-parent = <&gic>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = <GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 37>; + clock-names = "pclk"; + }; + +... 
diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml index abb22db3bb28..79d0358e2f61 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml @@ -27,6 +27,7 @@ properties: - renesas,intc-ex-r8a774a1 # RZ/G2M - renesas,intc-ex-r8a774b1 # RZ/G2N - renesas,intc-ex-r8a774c0 # RZ/G2E + - renesas,intc-ex-r8a774e1 # RZ/G2H - renesas,intc-ex-r8a7795 # R-Car H3 - renesas,intc-ex-r8a7796 # R-Car M3-W - renesas,intc-ex-r8a77961 # R-Car M3-W+ diff --git a/Documentation/devicetree/bindings/media/i2c/ovti,ov5647.yaml b/Documentation/devicetree/bindings/media/i2c/ovti,ov5647.yaml index 3e5d82df90a2..a2abed06a099 100644 --- a/Documentation/devicetree/bindings/media/i2c/ovti,ov5647.yaml +++ b/Documentation/devicetree/bindings/media/i2c/ovti,ov5647.yaml @@ -31,7 +31,7 @@ properties: maxItems: 1 port: - $ref: /schemas/graph.yaml#/properties/port + $ref: /schemas/graph.yaml#/$defs/port-base additionalProperties: false properties: diff --git a/Documentation/devicetree/bindings/media/i2c/ovti,ov9282.yaml b/Documentation/devicetree/bindings/media/i2c/ovti,ov9282.yaml index ad42992c6da3..bf115ab9d926 100644 --- a/Documentation/devicetree/bindings/media/i2c/ovti,ov9282.yaml +++ b/Documentation/devicetree/bindings/media/i2c/ovti,ov9282.yaml @@ -38,7 +38,7 @@ properties: port: additionalProperties: false - $ref: /schemas/graph.yaml#/properties/port + $ref: /schemas/graph.yaml#/$defs/port-base properties: endpoint: diff --git a/Documentation/devicetree/bindings/media/i2c/sony,imx335.yaml b/Documentation/devicetree/bindings/media/i2c/sony,imx335.yaml index 881f79532501..cf2ca2702cc9 100644 --- a/Documentation/devicetree/bindings/media/i2c/sony,imx335.yaml +++ b/Documentation/devicetree/bindings/media/i2c/sony,imx335.yaml @@ -38,7 +38,7 @@ properties: port: additionalProperties: false - $ref: /schemas/graph.yaml#/properties/port + $ref: /schemas/graph.yaml#/$defs/port-base properties: endpoint: diff --git a/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml b/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml index 1edeabf39e6a..afcf70947f7e 100644 --- a/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml +++ b/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml @@ -38,7 +38,7 @@ properties: port: additionalProperties: false - $ref: /schemas/graph.yaml#/properties/port + $ref: /schemas/graph.yaml#/$defs/port-base properties: endpoint: diff --git a/Documentation/devicetree/bindings/mfd/brcm,cru.yaml b/Documentation/devicetree/bindings/mfd/brcm,cru.yaml index fc1317ab3226..28ac60acf4ac 100644 --- a/Documentation/devicetree/bindings/mfd/brcm,cru.yaml +++ b/Documentation/devicetree/bindings/mfd/brcm,cru.yaml @@ -32,13 +32,13 @@ properties: "#size-cells": const: 1 - pinctrl: - $ref: ../pinctrl/brcm,ns-pinmux.yaml - patternProperties: '^clock-controller@[a-f0-9]+$': $ref: ../clock/brcm,iproc-clocks.yaml + '^pin-controller@[a-f0-9]+$': + $ref: ../pinctrl/brcm,ns-pinmux.yaml + '^thermal@[a-f0-9]+$': $ref: ../thermal/brcm,ns-thermal.yaml @@ -73,9 +73,10 @@ examples: "iprocfast", "sata1", "sata2"; }; - pinctrl { + pin-controller@1c0 { compatible = "brcm,bcm4708-pinmux"; - offset = <0x1c0>; + reg = <0x1c0 0x24>; + reg-names = "cru_gpio_control"; }; thermal@2c0 { diff --git a/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml 
b/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml index e6c9a2f77cc7..f300ced4cdf3 100644 --- a/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml +++ b/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml @@ -20,9 +20,7 @@ properties: - snps,dwcmshc-sdhci reg: - minItems: 1 - items: - - description: Offset and length of the register set for the device + maxItems: 1 interrupts: maxItems: 1 diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt b/Documentation/devicetree/bindings/net/dsa/marvell.txt index 30c11fea491b..2363b412410c 100644 --- a/Documentation/devicetree/bindings/net/dsa/marvell.txt +++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt @@ -83,7 +83,7 @@ Example: #interrupt-cells = <2>; switch0: switch@0 { - compatible = "marvell,mv88e6390"; + compatible = "marvell,mv88e6190"; reg = <0>; reset-gpios = <&gpio5 1 GPIO_ACTIVE_LOW>; diff --git a/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml b/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml index 5629b2e4ccf8..ee4afe361fac 100644 --- a/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml +++ b/Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml @@ -34,7 +34,6 @@ properties: clocks: minItems: 3 - maxItems: 5 items: - description: MAC host clock - description: MAC apb clock diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index 42689b7d03a2..c115c95ee584 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -21,6 +21,7 @@ select: contains: enum: - snps,dwmac + - snps,dwmac-3.40a - snps,dwmac-3.50a - snps,dwmac-3.610 - snps,dwmac-3.70a @@ -76,6 +77,7 @@ properties: - rockchip,rk3399-gmac - rockchip,rv1108-gmac - snps,dwmac + - snps,dwmac-3.40a - snps,dwmac-3.50a - snps,dwmac-3.610 - snps,dwmac-3.70a diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml index 2911e565b260..acea1cd444fd 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml @@ -41,7 +41,6 @@ properties: - description: builtin MSI controller. interrupt-names: - minItems: 1 items: - const: msi diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,ns-pinmux.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,ns-pinmux.yaml index 470aff599c27..fc39e3e9f71c 100644 --- a/Documentation/devicetree/bindings/pinctrl/brcm,ns-pinmux.yaml +++ b/Documentation/devicetree/bindings/pinctrl/brcm,ns-pinmux.yaml @@ -17,9 +17,6 @@ description: A list of pins varies across chipsets so few bindings are available. - Node of the pinmux must be nested in the CRU (Central Resource Unit) "syscon" - node. 
- properties: compatible: enum: @@ -27,10 +24,11 @@ properties: - brcm,bcm4709-pinmux - brcm,bcm53012-pinmux - offset: - description: offset of pin registers in the CRU block + reg: maxItems: 1 - $ref: /schemas/types.yaml#/definitions/uint32-array + + reg-names: + const: cru_gpio_control patternProperties: '-pins$': @@ -72,23 +70,20 @@ allOf: uart1_grp ] required: - - offset + - reg + - reg-names additionalProperties: false examples: - | - cru@1800c100 { - compatible = "syscon", "simple-mfd"; - reg = <0x1800c100 0x1a4>; - - pinctrl { - compatible = "brcm,bcm4708-pinmux"; - offset = <0xc0>; - - spi-pins { - function = "spi"; - groups = "spi_grp"; - }; + pin-controller@1800c1c0 { + compatible = "brcm,bcm4708-pinmux"; + reg = <0x1800c1c0 0x24>; + reg-names = "cru_gpio_control"; + + spi-pins { + function = "spi"; + groups = "spi_grp"; }; }; diff --git a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml index ca91201a9926..d7e08b03e204 100644 --- a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml +++ b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml @@ -171,7 +171,7 @@ examples: cs-gpios = <&gpio0 13 0>, <&gpio0 14 0>; rx-sample-delay-ns = <3>; - spi-flash@1 { + flash@1 { compatible = "spi-nand"; reg = <1>; rx-sample-delay-ns = <7>; diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index b97579b7d8fb..01df283c7d04 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios: immutable and bit-for-bit identical to the official golden image for their releases due to security and other considerations and - - hope to save some extra storage space with guaranteed end-to-end performance - by using reduced metadata and transparent file compression, especially - for those embedded devices with limited memory (ex, smartphone); + - hope to minimize extra storage space with guaranteed end-to-end performance + by using compact layout, transparent file compression and direct access, + especially for those embedded devices with limited memory and high-density + hosts with numerous containers; Here is the main features of EROFS: @@ -51,7 +52,9 @@ Here is the main features of EROFS: - Support POSIX.1e ACLs by using xattrs; - Support transparent data compression as an option: - LZ4 algorithm with the fixed-sized output compression for high performance. + LZ4 algorithm with the fixed-sized output compression for high performance; + + - Multiple device support for multi-layer container images. The following git tree provides the file system user-space tools under development (ex, formatting tool mkfs.erofs): @@ -87,6 +90,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on: dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. +device=%s Specify a path to an extra device to be used together. 
=================== ========================================================= On-disk details diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 0eb799d9d05a..4d5d50dca65c 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -77,11 +77,11 @@ Side-channel attacks fscrypt is only resistant to side-channel attacks, such as timing or electromagnetic attacks, to the extent that the underlying Linux -Cryptographic API algorithms are. If a vulnerable algorithm is used, -such as a table-based implementation of AES, it may be possible for an -attacker to mount a side channel attack against the online system. -Side channel attacks may also be mounted against applications -consuming decrypted data. +Cryptographic API algorithms or inline encryption hardware are. If a +vulnerable algorithm is used, such as a table-based implementation of +AES, it may be possible for an attacker to mount a side channel attack +against the online system. Side channel attacks may also be mounted +against applications consuming decrypted data. Unauthorized file access ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -176,11 +176,11 @@ Master Keys Each encrypted directory tree is protected by a *master key*. Master keys can be up to 64 bytes long, and must be at least as long as the -greater of the key length needed by the contents and filenames -encryption modes being used. For example, if AES-256-XTS is used for -contents encryption, the master key must be 64 bytes (512 bits). Note -that the XTS mode is defined to require a key twice as long as that -required by the underlying block cipher. +greater of the security strength of the contents and filenames +encryption modes being used. For example, if any AES-256 mode is +used, the master key must be at least 256 bits, i.e. 32 bytes. A +stricter requirement applies if the key is used by a v1 encryption +policy and AES-256-XTS is used; such keys must be 64 bytes. To "unlock" an encrypted directory tree, userspace must provide the appropriate master key. There can be any number of master keys, each @@ -1135,6 +1135,50 @@ where applications may later write sensitive data. It is recommended that systems implementing a form of "verified boot" take advantage of this by validating all top-level encryption policies prior to access. +Inline encryption support +========================= + +By default, fscrypt uses the kernel crypto API for all cryptographic +operations (other than HKDF, which fscrypt partially implements +itself). The kernel crypto API supports hardware crypto accelerators, +but only ones that work in the traditional way where all inputs and +outputs (e.g. plaintexts and ciphertexts) are in memory. fscrypt can +take advantage of such hardware, but the traditional acceleration +model isn't particularly efficient and fscrypt hasn't been optimized +for it. + +Instead, many newer systems (especially mobile SoCs) have *inline +encryption hardware* that can encrypt/decrypt data while it is on its +way to/from the storage device. Linux supports inline encryption +through a set of extensions to the block layer called *blk-crypto*. +blk-crypto allows filesystems to attach encryption contexts to bios +(I/O requests) to specify how the data will be encrypted or decrypted +in-line. For more information about blk-crypto, see +:ref:`Documentation/block/inline-encryption.rst <inline_encryption>`. 
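As the next paragraphs explain, this support is opt-in via the ``inlinecrypt`` mount option. Purely for illustration (the device path and mount point below are hypothetical, and error handling is minimal), a userspace program could request it when mounting an ext4 filesystem like this::

  #include <stdio.h>
  #include <sys/mount.h>

  int main(void)
  {
          /* "inlinecrypt" is passed in the filesystem-specific options string. */
          if (mount("/dev/sdb1", "/mnt", "ext4", 0, "inlinecrypt") != 0) {
                  perror("mount");
                  return 1;
          }
          return 0;
  }
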
+ +On supported filesystems (currently ext4 and f2fs), fscrypt can use +blk-crypto instead of the kernel crypto API to encrypt/decrypt file +contents. To enable this, set CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y in +the kernel configuration, and specify the "inlinecrypt" mount option +when mounting the filesystem. + +Note that the "inlinecrypt" mount option just specifies to use inline +encryption when possible; it doesn't force its use. fscrypt will +still fall back to using the kernel crypto API on files where the +inline encryption hardware doesn't have the needed crypto capabilities +(e.g. support for the needed encryption algorithm and data unit size) +and where blk-crypto-fallback is unusable. (For blk-crypto-fallback +to be usable, it must be enabled in the kernel configuration with +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y.) + +Currently fscrypt always uses the filesystem block size (which is +usually 4096 bytes) as the data unit size. Therefore, it can only use +inline encryption hardware that supports that data unit size. + +Inline encryption doesn't affect the ciphertext or other aspects of +the on-disk format, so users may freely switch back and forth between +using "inlinecrypt" and not using "inlinecrypt". + Implementation details ====================== @@ -1184,6 +1228,13 @@ keys`_ and `DIRECT_KEY policies`_. Data path changes ----------------- +When inline encryption is used, filesystems just need to associate +encryption contexts with bios to specify how the block layer or the +inline encryption hardware will encrypt/decrypt the file contents. + +When inline encryption isn't used, filesystems must encrypt/decrypt +the file contents themselves, as described below: + For the read path (->readpage()) of regular files, filesystems can read the ciphertext into the page cache and decrypt it in-place. The page lock must be held until decryption has finished, to prevent the @@ -1197,18 +1248,6 @@ buffer. Some filesystems, such as UBIFS, already use temporary buffers regardless of encryption. Other filesystems, such as ext4 and F2FS, have to allocate bounce pages specially for encryption. -Fscrypt is also able to use inline encryption hardware instead of the -kernel crypto API for en/decryption of file contents. When possible, -and if directed to do so (by specifying the 'inlinecrypt' mount option -for an ext4/F2FS filesystem), it adds encryption contexts to bios and -uses blk-crypto to perform the en/decryption instead of making use of -the above read/write path changes. Of course, even if directed to -make use of inline encryption, fscrypt will only be able to do so if -either hardware inline encryption support is available for the -selected encryption algorithm or CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK -is selected. If neither is the case, fscrypt will fall back to using -the above mentioned read/write path changes for en/decryption. - Filename hashing and encoding ----------------------------- diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index c0ad233963ae..bee63d42e5ec 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -29,7 +29,6 @@ algorithms work. fiemap files locks - mandatory-locking mount_api quota seq_file diff --git a/Documentation/filesystems/locks.rst b/Documentation/filesystems/locks.rst index c5ae858b1aac..26429317dbbc 100644 --- a/Documentation/filesystems/locks.rst +++ b/Documentation/filesystems/locks.rst @@ -57,16 +57,9 @@ fcntl(), with all the problems that implies. 
1.3 Mandatory Locking As A Mount Option --------------------------------------- -Mandatory locking, as described in -'Documentation/filesystems/mandatory-locking.rst' was prior to this release a -general configuration option that was valid for all mounted filesystems. This -had a number of inherent dangers, not the least of which was the ability to -freeze an NFS server by asking it to read a file for which a mandatory lock -existed. - -From this release of the kernel, mandatory locking can be turned on and off -on a per-filesystem basis, using the mount options 'mand' and 'nomand'. -The default is to disallow mandatory locking. The intention is that -mandatory locking only be enabled on a local filesystem as the specific need -arises. +Mandatory locking was prior to this release a general configuration option +that was valid for all mounted filesystems. This had a number of inherent +dangers, not the least of which was the ability to freeze an NFS server by +asking it to read a file for which a mandatory lock existed. +Such option was dropped in Kernel v5.14. diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 57a641847818..bb68d39f03b7 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -524,3 +524,5 @@ Note that these methods are passed a pointer to the cache resource structure, not the read request structure as they could be used in other situations where there isn't a read request structure as well, such as writing dirty data to the cache. + +.. kernel-doc:: include/linux/netfs.h diff --git a/Documentation/filesystems/ntfs3.rst b/Documentation/filesystems/ntfs3.rst index ffe9ea0c1499..d67ccd22c63b 100644 --- a/Documentation/filesystems/ntfs3.rst +++ b/Documentation/filesystems/ntfs3.rst @@ -4,103 +4,112 @@ NTFS3 ===== - Summary and Features ==================== -NTFS3 is fully functional NTFS Read-Write driver. The driver works with -NTFS versions up to 3.1, normal/compressed/sparse files -and journal replaying. File system type to use on mount is 'ntfs3'. +NTFS3 is fully functional NTFS Read-Write driver. The driver works with NTFS +versions up to 3.1. File system type to use on mount is *ntfs3*. - This driver implements NTFS read/write support for normal, sparse and compressed files. -- Supports native journal replaying; -- Supports extended attributes - Predefined extended attributes: - - 'system.ntfs_security' gets/sets security - descriptor (SECURITY_DESCRIPTOR_RELATIVE) - - 'system.ntfs_attrib' gets/sets ntfs file/dir attributes. - Note: applied to empty files, this allows to switch type between - sparse(0x200), compressed(0x800) and normal; +- Supports native journal replaying. - Supports NFS export of mounted NTFS volumes. +- Supports extended attributes. Predefined extended attributes: + + - *system.ntfs_security* gets/sets security + + Descriptor: SECURITY_DESCRIPTOR_RELATIVE + + - *system.ntfs_attrib* gets/sets ntfs file/dir attributes. + + Note: Applied to empty files, this allows to switch type between + sparse(0x200), compressed(0x800) and normal. Mount Options ============= The list below describes mount options supported by NTFS3 driver in addition to -generic ones. +generic ones. You can use every mount option with **no** option. If it is in +this table marked with no it means default is without **no**. -=============================================================================== +.. 
flat-table:: + :widths: 1 5 + :fill-cells: -nls=name This option informs the driver how to interpret path - strings and translate them to Unicode and back. If - this option is not set, the default codepage will be - used (CONFIG_NLS_DEFAULT). - Examples: - 'nls=utf8' + * - iocharset=name + - This option informs the driver how to interpret path strings and + translate them to Unicode and back. If this option is not set, the + default codepage will be used (CONFIG_NLS_DEFAULT). -uid= -gid= -umask= Controls the default permissions for files/directories created - after the NTFS volume is mounted. + Example: iocharset=utf8 -fmask= -dmask= Instead of specifying umask which applies both to - files and directories, fmask applies only to files and - dmask only to directories. + * - uid= + - :rspan:`1` + * - gid= -nohidden Files with the Windows-specific HIDDEN (FILE_ATTRIBUTE_HIDDEN) - attribute will not be shown under Linux. + * - umask= + - Controls the default permissions for files/directories created after + the NTFS volume is mounted. -sys_immutable Files with the Windows-specific SYSTEM - (FILE_ATTRIBUTE_SYSTEM) attribute will be marked as system - immutable files. + * - dmask= + - :rspan:`1` Instead of specifying umask which applies both to files and + directories, fmask applies only to files and dmask only to directories. + * - fmask= -discard Enable support of the TRIM command for improved performance - on delete operations, which is recommended for use with the - solid-state drives (SSD). + * - noacsrules + - "No access rules" mount option sets access rights for files/folders to + 777 and owner/group to root. This mount option absorbs all other + permissions. -force Forces the driver to mount partitions even if 'dirty' flag - (volume dirty) is set. Not recommended for use. + - Permissions change for files/folders will be reported as successful, + but they will remain 777. -sparse Create new files as "sparse". + - Owner/group change will be reported as successful, butthey will stay + as root. -showmeta Use this parameter to show all meta-files (System Files) on - a mounted NTFS partition. - By default, all meta-files are hidden. + * - nohidden + - Files with the Windows-specific HIDDEN (FILE_ATTRIBUTE_HIDDEN) attribute + will not be shown under Linux. -prealloc Preallocate space for files excessively when file size is - increasing on writes. Decreases fragmentation in case of - parallel write operations to different files. + * - sys_immutable + - Files with the Windows-specific SYSTEM (FILE_ATTRIBUTE_SYSTEM) attribute + will be marked as system immutable files. -no_acs_rules "No access rules" mount option sets access rights for - files/folders to 777 and owner/group to root. This mount - option absorbs all other permissions: - - permissions change for files/folders will be reported - as successful, but they will remain 777; - - owner/group change will be reported as successful, but - they will stay as root + * - discard + - Enable support of the TRIM command for improved performance on delete + operations, which is recommended for use with the solid-state drives + (SSD). -acl Support POSIX ACLs (Access Control Lists). Effective if - supported by Kernel. Not to be confused with NTFS ACLs. - The option specified as acl enables support for POSIX ACLs. + * - force + - Forces the driver to mount partitions even if volume is marked dirty. + Not recommended for use. -noatime All files and directories will not update their last access - time attribute if a partition is mounted with this parameter. 
- This option can speed up file system operation. + * - sparse + - Create new files as sparse. -=============================================================================== + * - showmeta + - Use this parameter to show all meta-files (System Files) on a mounted + NTFS partition. By default, all meta-files are hidden. -ToDo list -========= + * - prealloc + - Preallocate space for files excessively when file size is increasing on + writes. Decreases fragmentation in case of parallel write operations to + different files. -- Full journaling support (currently journal replaying is supported) over JBD. + * - acl + - Support POSIX ACLs (Access Control Lists). Effective if supported by + Kernel. Not to be confused with NTFS ACLs. The option specified as acl + enables support for POSIX ACLs. +Todo list +========= +- Full journaling support over JBD. Currently journal replaying is supported + which is not necessarily as effectice as JBD would be. References ========== -https://www.paragon-software.com/home/ntfs-linux-professional/ - - Commercial version of the NTFS driver for Linux. +- Commercial version of the NTFS driver for Linux. + https://www.paragon-software.com/home/ntfs-linux-professional/ -almaz.alexandrovich@paragon-software.com - - Direct e-mail address for feedback and requests on the NTFS3 implementation. +- Direct e-mail address for feedback and requests on the NTFS3 implementation. + almaz.alexandrovich@paragon-software.com diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst index 364680cdad2e..8ba72e898099 100644 --- a/Documentation/gpu/amdgpu.rst +++ b/Documentation/gpu/amdgpu.rst @@ -300,8 +300,8 @@ pcie_replay_count .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c :doc: pcie_replay_count -+GPU SmartShift Information -============================ +GPU SmartShift Information +========================== GPU SmartShift information via sysfs diff --git a/Documentation/gpu/drm-internals.rst b/Documentation/gpu/drm-internals.rst index 06af044c882f..607f78f0f189 100644 --- a/Documentation/gpu/drm-internals.rst +++ b/Documentation/gpu/drm-internals.rst @@ -111,15 +111,6 @@ Component Helper Usage .. kernel-doc:: drivers/gpu/drm/drm_drv.c :doc: component helper usage recommendations -IRQ Helper Library -~~~~~~~~~~~~~~~~~~ - -.. kernel-doc:: drivers/gpu/drm/drm_irq.c - :doc: irq helpers - -.. kernel-doc:: drivers/gpu/drm/drm_irq.c - :export: - Memory Manager Initialization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst index 90bc3f51eda9..e6cd40663ea5 100644 --- a/Documentation/kernel-hacking/locking.rst +++ b/Documentation/kernel-hacking/locking.rst @@ -1352,7 +1352,19 @@ Mutex API reference Futex API reference =================== -.. kernel-doc:: kernel/futex.c +.. kernel-doc:: kernel/futex/core.c + :internal: + +.. kernel-doc:: kernel/futex/futex.h + :internal: + +.. kernel-doc:: kernel/futex/pi.c + :internal: + +.. kernel-doc:: kernel/futex/requeue.c + :internal: + +.. kernel-doc:: kernel/futex/waitwake.c :internal: Further reading diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst index a432dc419fa4..5d97cee9457b 100644 --- a/Documentation/networking/devlink/ice.rst +++ b/Documentation/networking/devlink/ice.rst @@ -30,10 +30,11 @@ The ``ice`` driver reports the following versions PHY, link, etc. * - ``fw.mgmt.api`` - running - - 1.5 - - 2-digit version number of the API exported over the AdminQ by the - management firmware. 
Used by the driver to identify what commands - are supported. + - 1.5.1 + - 3-digit version number (major.minor.patch) of the API exported over + the AdminQ by the management firmware. Used by the driver to + identify what commands are supported. Historical versions of the + kernel only displayed a 2-digit version number (major.minor). * - ``fw.mgmt.build`` - running - 0x305d955f
diff --git a/Documentation/networking/mctp.rst b/Documentation/networking/mctp.rst index 6100cdc220f6..fa7730dbf7b9 100644 --- a/Documentation/networking/mctp.rst +++ b/Documentation/networking/mctp.rst @@ -59,11 +59,11 @@ specified with a ``sockaddr`` type, with a single-byte endpoint address: }; struct sockaddr_mctp { - unsigned short int smctp_family; - int smctp_network; - struct mctp_addr smctp_addr; - __u8 smctp_type; - __u8 smctp_tag; + __kernel_sa_family_t smctp_family; + unsigned int smctp_network; + struct mctp_addr smctp_addr; + __u8 smctp_type; + __u8 smctp_tag; }; #define MCTP_NET_ANY 0x0
diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst index 1efb8293bf1f..163f1bd4e857 100644 --- a/Documentation/translations/it_IT/kernel-hacking/locking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst @@ -1396,7 +1396,19 @@ Riferimento per l'API dei Mutex Riferimento per l'API dei Futex =============================== -.. kernel-doc:: kernel/futex.c +.. kernel-doc:: kernel/futex/core.c + :internal: + +.. kernel-doc:: kernel/futex/futex.h + :internal: + +.. kernel-doc:: kernel/futex/pi.c + :internal: + +.. kernel-doc:: kernel/futex/requeue.c + :internal: + +.. kernel-doc:: kernel/futex/waitwake.c :internal: Approfondimenti
diff --git a/Documentation/userspace-api/futex2.rst b/Documentation/userspace-api/futex2.rst new file mode 100644 index 000000000000..9693f47a7e62 --- /dev/null +++ b/Documentation/userspace-api/futex2.rst @@ -0,0 +1,86 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +futex2 +====== + +:Author: André Almeida <andrealmeid@collabora.com> + +futex, or fast user mutex, is a set of syscalls to allow userspace to create +performant synchronization mechanisms, such as mutexes, semaphores and +condition variables in userspace. C standard libraries, like glibc, use it +as a means to implement higher-level interfaces like pthreads. + +futex2 is a follow-up version of the initial futex syscall, designed to overcome +limitations of the original interface. + +User API +======== + +``futex_waitv()`` +----------------- + +Wait on an array of futexes, wake on any:: + + futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, + unsigned int flags, struct timespec *timeout, clockid_t clockid) + + struct futex_waitv { + __u64 val; + __u64 uaddr; + __u32 flags; + __u32 __reserved; + }; + +Userspace sets an array of struct futex_waitv (up to a max of 128 entries), +using ``uaddr`` for the address to wait for, ``val`` for the expected value +and ``flags`` to specify the type (e.g. private) and size of futex. +``__reserved`` needs to be 0, but it can be used for future extension. The +pointer for the first item of the array is passed as ``waiters``. An invalid +address for ``waiters`` or for any ``uaddr`` returns ``-EFAULT``. +
+If userspace has 32-bit pointers, it should do an explicit cast to make sure +the upper bits are zeroed. ``uintptr_t`` does the trick and it works for +both 32/64-bit pointers. + +``nr_futexes`` specifies the size of the array. Values outside the [1, 128] +interval will make the syscall return ``-EINVAL``.
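Before the remaining details, a minimal sketch of the call sequence: wait on two private 32-bit futexes at once with a one-second absolute timeout. It assumes userspace headers new enough to provide ``struct futex_waitv``, ``FUTEX_32``, ``FUTEX_PRIVATE_FLAG`` and ``__NR_futex_waitv``; there is no glibc wrapper, so the raw syscall is used (the selftest referenced below remains the authoritative example)::

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <time.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/futex.h>

  static uint32_t futex_a, futex_b;               /* both hold 0, the value we wait on */

  int main(void)
  {
          struct futex_waitv waiters[2];
          struct timespec timeout;
          long ret;

          memset(waiters, 0, sizeof(waiters));    /* __reserved must be 0 */

          waiters[0].uaddr = (uintptr_t)&futex_a; /* cast via uintptr_t zeroes the upper bits */
          waiters[0].val   = 0;                   /* expected value */
          waiters[0].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;

          waiters[1].uaddr = (uintptr_t)&futex_b;
          waiters[1].val   = 0;
          waiters[1].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;

          /* Absolute timeout, one second from now, measured on the clock passed as clockid. */
          clock_gettime(CLOCK_MONOTONIC, &timeout);
          timeout.tv_sec += 1;

          ret = syscall(__NR_futex_waitv, waiters, 2, 0, &timeout, CLOCK_MONOTONIC);
          if (ret >= 0)
                  printf("woken by futex at index %ld\n", ret);
          else
                  perror("futex_waitv");          /* nothing wakes us here, so expect a timeout */
          return 0;
  }

Because both futexes still hold the expected value and nothing wakes them, this sketch simply times out; a real consumer would have another thread change the value and issue a ``FUTEX_WAKE`` on that address.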
+ +The ``flags`` argument of the syscall needs to be 0, but it can be used for +future extension. + +For each entry in the ``waiters`` array, the current value at ``uaddr`` is compared +to ``val``. If it's different, the syscall undoes all the work done so far and +returns ``-EAGAIN``. If all tests and verifications succeed, the syscall waits until +one of the following happens: + +- The timeout expires, returning ``-ETIMEDOUT``. +- A signal was sent to the sleeping task, returning ``-ERESTARTSYS``. +- Some futex in the list was woken, returning the index of the woken futex. + +An example of how to use the interface can be found at ``tools/testing/selftests/futex/functional/futex_waitv.c``. + +Timeout +------- + +The ``struct timespec *timeout`` argument is optional and points to an +absolute timeout. You need to specify the type of clock being used in the +``clockid`` argument. ``CLOCK_MONOTONIC`` and ``CLOCK_REALTIME`` are supported. +This syscall accepts only 64-bit timespec structs. + +Types of futex +-------------- + +A futex can be either private or shared. Private is used for processes that +share the same memory space and the virtual address of the futex will be the +same for all processes. This allows for optimizations in the kernel. To use +private futexes, it's necessary to specify ``FUTEX_PRIVATE_FLAG`` in the futex +flag. Processes that don't share the same memory space and therefore can +have different virtual addresses for the same futex (using, for instance, +file-backed shared memory) require different internal mechanisms to be +properly enqueued. This is the default behavior, and it works with both private +and shared futexes. + +Futexes can be of different sizes: 8, 16, 32 or 64 bits. Currently, the only +supported one is the 32-bit sized futex, and it needs to be specified using +the ``FUTEX_32`` flag.
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index c432be070f67..a61eac0c73f8 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -28,6 +28,7 @@ place where this information is gathered. media/index sysfs-platform_profile vduse + futex2 .. only:: subproject and html
diff --git a/Documentation/userspace-api/ioctl/cdrom.rst b/Documentation/userspace-api/ioctl/cdrom.rst index 3b4c0506de46..682948fc88a3 100644 --- a/Documentation/userspace-api/ioctl/cdrom.rst +++ b/Documentation/userspace-api/ioctl/cdrom.rst @@ -13,61 +13,64 @@ in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c ioctl values are listed in <linux/cdrom.h>.
As of this writing, they are as follows: - ====================== =============================================== - CDROMPAUSE Pause Audio Operation - CDROMRESUME Resume paused Audio Operation - CDROMPLAYMSF Play Audio MSF (struct cdrom_msf) - CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti) - CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr) - CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry) - CDROMSTOP Stop the cdrom drive - CDROMSTART Start the cdrom drive - CDROMEJECT Ejects the cdrom media - CDROMVOLCTRL Control output volume (struct cdrom_volctrl) - CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl) - CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes) - (struct cdrom_read) - CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes) - (struct cdrom_read) - CDROMREADAUDIO (struct cdrom_read_audio) - CDROMEJECT_SW enable(1)/disable(0) auto-ejecting - CDROMMULTISESSION Obtain the start-of-last-session - address of multi session disks - (struct cdrom_multisession) - CDROM_GET_MCN Obtain the "Universal Product Code" - if available (struct cdrom_mcn) - CDROM_GET_UPC Deprecated, use CDROM_GET_MCN instead. - CDROMRESET hard-reset the drive - CDROMVOLREAD Get the drive's volume setting - (struct cdrom_volctrl) - CDROMREADRAW read data in raw mode (2352 Bytes) - (struct cdrom_read) - CDROMREADCOOKED read data in cooked mode - CDROMSEEK seek msf address - CDROMPLAYBLK scsi-cd only, (struct cdrom_blk) - CDROMREADALL read all 2646 bytes - CDROMGETSPINDOWN return 4-bit spindown value - CDROMSETSPINDOWN set 4-bit spindown value - CDROMCLOSETRAY pendant of CDROMEJECT - CDROM_SET_OPTIONS Set behavior options - CDROM_CLEAR_OPTIONS Clear behavior options - CDROM_SELECT_SPEED Set the CD-ROM speed - CDROM_SELECT_DISC Select disc (for juke-boxes) - CDROM_MEDIA_CHANGED Check is media changed - CDROM_DRIVE_STATUS Get tray position, etc. - CDROM_DISC_STATUS Get disc type, etc. - CDROM_CHANGER_NSLOTS Get number of slots - CDROM_LOCKDOOR lock or unlock door - CDROM_DEBUG Turn debug messages on/off - CDROM_GET_CAPABILITY get capabilities - CDROMAUDIOBUFSIZ set the audio buffer size - DVD_READ_STRUCT Read structure - DVD_WRITE_STRUCT Write structure - DVD_AUTH Authentication - CDROM_SEND_PACKET send a packet to the drive - CDROM_NEXT_WRITABLE get next writable block - CDROM_LAST_WRITTEN get last block written on disc - ====================== =============================================== + ======================== =============================================== + CDROMPAUSE Pause Audio Operation + CDROMRESUME Resume paused Audio Operation + CDROMPLAYMSF Play Audio MSF (struct cdrom_msf) + CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti) + CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr) + CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry) + CDROMSTOP Stop the cdrom drive + CDROMSTART Start the cdrom drive + CDROMEJECT Ejects the cdrom media + CDROMVOLCTRL Control output volume (struct cdrom_volctrl) + CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl) + CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes) + (struct cdrom_read) + CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes) + (struct cdrom_read) + CDROMREADAUDIO (struct cdrom_read_audio) + CDROMEJECT_SW enable(1)/disable(0) auto-ejecting + CDROMMULTISESSION Obtain the start-of-last-session + address of multi session disks + (struct cdrom_multisession) + CDROM_GET_MCN Obtain the "Universal Product Code" + if available (struct cdrom_mcn) + CDROM_GET_UPC Deprecated, use CDROM_GET_MCN instead. 
+ CDROMRESET hard-reset the drive + CDROMVOLREAD Get the drive's volume setting + (struct cdrom_volctrl) + CDROMREADRAW read data in raw mode (2352 Bytes) + (struct cdrom_read) + CDROMREADCOOKED read data in cooked mode + CDROMSEEK seek msf address + CDROMPLAYBLK scsi-cd only, (struct cdrom_blk) + CDROMREADALL read all 2646 bytes + CDROMGETSPINDOWN return 4-bit spindown value + CDROMSETSPINDOWN set 4-bit spindown value + CDROMCLOSETRAY pendant of CDROMEJECT + CDROM_SET_OPTIONS Set behavior options + CDROM_CLEAR_OPTIONS Clear behavior options + CDROM_SELECT_SPEED Set the CD-ROM speed + CDROM_SELECT_DISC Select disc (for juke-boxes) + CDROM_MEDIA_CHANGED Check is media changed + CDROM_TIMED_MEDIA_CHANGE Check if media changed + since given time + (struct cdrom_timed_media_change_info) + CDROM_DRIVE_STATUS Get tray position, etc. + CDROM_DISC_STATUS Get disc type, etc. + CDROM_CHANGER_NSLOTS Get number of slots + CDROM_LOCKDOOR lock or unlock door + CDROM_DEBUG Turn debug messages on/off + CDROM_GET_CAPABILITY get capabilities + CDROMAUDIOBUFSIZ set the audio buffer size + DVD_READ_STRUCT Read structure + DVD_WRITE_STRUCT Write structure + DVD_AUTH Authentication + CDROM_SEND_PACKET send a packet to the drive + CDROM_NEXT_WRITABLE get next writable block + CDROM_LAST_WRITTEN get last block written on disc + ======================== =============================================== The information that follows was determined from reading kernel source diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 2e8134059c87..6655d929a351 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -104,6 +104,7 @@ Code Seq# Include File Comments '8' all SNP8023 advanced NIC card <mailto:mcr@solidum.com> ';' 64-7F linux/vfio.h +'=' 00-3f uapi/linux/ptp_clock.h <mailto:richardcochran@gmail.com> '@' 00-0F linux/radeonfb.h conflict! '@' 00-0F drivers/video/aty/aty128fb.c conflict! 'A' 00-1F linux/apm_bios.h conflict! diff --git a/Documentation/userspace-api/vduse.rst b/Documentation/userspace-api/vduse.rst index 42ef59ea5314..bdb880e01132 100644 --- a/Documentation/userspace-api/vduse.rst +++ b/Documentation/userspace-api/vduse.rst @@ -18,7 +18,7 @@ types can be added after the security issue of corresponding device driver is clarified or fixed in the future. 
Create/Destroy VDUSE devices ------------------------- +---------------------------- VDUSE devices are created as follows: diff --git a/MAINTAINERS b/MAINTAINERS index abdcbcfef73d..96a96b1529dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1276,6 +1276,7 @@ F: drivers/input/mouse/bcm5974.c APPLE DART IOMMU DRIVER M: Sven Peter <sven@svenpeter.dev> +R: Alyssa Rosenzweig <alyssa@rosenzweig.io> L: iommu@lists.linux-foundation.org S: Maintained F: Documentation/devicetree/bindings/iommu/apple,dart.yaml @@ -1551,7 +1552,7 @@ ARM PRIMECELL VIC PL190/PL192 DRIVER M: Linus Walleij <linus.walleij@linaro.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -F: Documentation/devicetree/bindings/interrupt-controller/arm,vic.txt +F: Documentation/devicetree/bindings/interrupt-controller/arm,vic.yaml F: drivers/irqchip/irq-vic.c ARM SMC WATCHDOG DRIVER @@ -1712,6 +1713,8 @@ F: drivers/*/*alpine* ARM/APPLE MACHINE SUPPORT M: Hector Martin <marcan@marcan.st> +M: Sven Peter <sven@svenpeter.dev> +R: Alyssa Rosenzweig <alyssa@rosenzweig.io> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained W: https://asahilinux.org @@ -2237,6 +2240,7 @@ F: arch/arm/mach-pxa/mioa701.c ARM/MStar/Sigmastar Armv7 SoC support M: Daniel Palmer <daniel@thingy.jp> +M: Romain Perier <romain.perier@gmail.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained W: http://linux-chenxing.org/ @@ -2713,6 +2717,7 @@ F: drivers/power/reset/keystone-reset.c ARM/TEXAS INSTRUMENTS K3 ARCHITECTURE M: Nishanth Menon <nm@ti.com> +M: Vignesh Raghavendra <vigneshr@ti.com> M: Tero Kristo <kristo@kernel.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported @@ -5453,6 +5458,19 @@ F: include/net/devlink.h F: include/uapi/linux/devlink.h F: net/core/devlink.c +DH ELECTRONICS IMX6 DHCOM BOARD SUPPORT +M: Christoph Niedermaier <cniedermaier@dh-electronics.com> +L: kernel@dh-electronics.com +S: Maintained +F: arch/arm/boot/dts/imx6*-dhcom-* + +DH ELECTRONICS STM32MP1 DHCOM/DHCOR BOARD SUPPORT +M: Marek Vasut <marex@denx.de> +L: kernel@dh-electronics.com +S: Maintained +F: arch/arm/boot/dts/stm32mp1*-dhcom-* +F: arch/arm/boot/dts/stm32mp1*-dhcor-* + DIALOG SEMICONDUCTOR DRIVERS M: Support Opensource <support.opensource@diasemi.com> S: Supported @@ -6142,8 +6160,7 @@ T: git git://anongit.freedesktop.org/drm/drm F: Documentation/devicetree/bindings/display/ F: Documentation/devicetree/bindings/gpu/ F: Documentation/gpu/ -F: drivers/gpu/drm/ -F: drivers/gpu/vga/ +F: drivers/gpu/ F: include/drm/ F: include/linux/vga* F: include/uapi/drm/ @@ -7338,10 +7355,11 @@ F: include/uapi/linux/fpga-dfl.h FPGA MANAGER FRAMEWORK M: Moritz Fischer <mdf@kernel.org> +M: Wu Hao <hao.wu@intel.com> +M: Xu Yilun <yilun.xu@intel.com> R: Tom Rix <trix@redhat.com> L: linux-fpga@vger.kernel.org S: Maintained -W: http://www.rocketboards.org Q: http://patchwork.kernel.org/project/linux-fpga/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mdf/linux-fpga.git F: Documentation/devicetree/bindings/fpga/ @@ -7435,7 +7453,7 @@ FREESCALE IMX / MXC FEC DRIVER M: Joakim Zhang <qiangqing.zhang@nxp.com> L: netdev@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/net/fsl-fec.txt +F: Documentation/devicetree/bindings/net/fsl,fec.yaml F: drivers/net/ethernet/freescale/fec.h F: drivers/net/ethernet/freescale/fec_main.c F: drivers/net/ethernet/freescale/fec_ptp.c @@ -7719,6 +7737,7 @@ M: Ingo Molnar <mingo@redhat.com> R: Peter 
Zijlstra <peterz@infradead.org> R: Darren Hart <dvhart@infradead.org> R: Davidlohr Bueso <dave@stgolabs.net> +R: AndrĂ© Almeida <andrealmeid@collabora.com> L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core @@ -7726,7 +7745,7 @@ F: Documentation/locking/*futex* F: include/asm-generic/futex.h F: include/linux/futex.h F: include/uapi/linux/futex.h -F: kernel/futex.c +F: kernel/futex/* F: tools/perf/bench/futex* F: tools/testing/selftests/futex/ @@ -8609,9 +8628,8 @@ F: Documentation/devicetree/bindings/iio/humidity/st,hts221.yaml F: drivers/iio/humidity/hts221* HUAWEI ETHERNET DRIVER -M: Bin Luo <luobin9@huawei.com> L: netdev@vger.kernel.org -S: Supported +S: Orphan F: Documentation/networking/device_drivers/ethernet/huawei/hinic.rst F: drivers/net/ethernet/huawei/hinic/ @@ -9303,7 +9321,7 @@ S: Maintained F: drivers/platform/x86/intel/atomisp2/led.c INTEL BIOS SAR INT1092 DRIVER -M: Shravan S <s.shravan@intel.com> +M: Shravan Sudhakar <s.shravan@intel.com> M: Intel Corporation <linuxwwan@intel.com> L: platform-driver-x86@vger.kernel.org S: Maintained @@ -9625,7 +9643,7 @@ F: include/uapi/linux/isst_if.h F: tools/power/x86/intel-speed-select/ INTEL STRATIX10 FIRMWARE DRIVERS -M: Richard Gong <richard.gong@linux.intel.com> +M: Dinh Nguyen <dinguyen@kernel.org> L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-devices-platform-stratix10-rsu @@ -10275,7 +10293,6 @@ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger <borntraeger@de.ibm.com> M: Janosch Frank <frankja@linux.ibm.com> R: David Hildenbrand <david@redhat.com> -R: Cornelia Huck <cohuck@redhat.com> R: Claudio Imbrenda <imbrenda@linux.ibm.com> L: kvm@vger.kernel.org S: Supported @@ -11149,6 +11166,7 @@ S: Maintained F: Documentation/devicetree/bindings/net/dsa/marvell.txt F: Documentation/networking/devlink/mv88e6xxx.rst F: drivers/net/dsa/mv88e6xxx/ +F: include/linux/dsa/mv88e6xxx.h F: include/linux/platform_data/mv88e6xxx.h MARVELL ARMADA 3700 PHY DRIVERS @@ -11273,7 +11291,6 @@ F: Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst F: drivers/net/ethernet/marvell/octeontx2/af/ MARVELL PRESTERA ETHERNET SWITCH DRIVER -M: Vadym Kochan <vkochan@marvell.com> M: Taras Chornyi <tchornyi@marvell.com> S: Supported W: https://github.com/Marvell-switching/switchdev-prestera @@ -12254,6 +12271,12 @@ L: linux-crypto@vger.kernel.org S: Maintained F: drivers/crypto/atmel-ecc.* +MICROCHIP EIC DRIVER +M: Claudiu Beznea <claudiu.beznea@microchip.com> +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +S: Supported +F: drivers/irqchip/irq-mchp-eic.c + MICROCHIP I2C DRIVER M: Codrin Ciubotariu <codrin.ciubotariu@microchip.com> L: linux-i2c@vger.kernel.org @@ -16297,6 +16320,7 @@ S390 M: Heiko Carstens <hca@linux.ibm.com> M: Vasily Gorbik <gor@linux.ibm.com> M: Christian Borntraeger <borntraeger@de.ibm.com> +R: Alexander Gordeev <agordeev@linux.ibm.com> L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ @@ -16375,7 +16399,6 @@ F: drivers/s390/crypto/vfio_ap_ops.c F: drivers/s390/crypto/vfio_ap_private.h S390 VFIO-CCW DRIVER -M: Cornelia Huck <cohuck@redhat.com> M: Eric Farman <farman@linux.ibm.com> M: Matthew Rosato <mjrosato@linux.ibm.com> R: Halil Pasic <pasic@linux.ibm.com> @@ -17794,7 +17817,6 @@ F: drivers/staging/nvec/ STAGING - OLPC SECONDARY DISPLAY CONTROLLER (DCON) M: Jens Frederich <jfrederich@gmail.com> -M: Daniel Drake <dsd@laptop.org> M: 
Jon Nettleton <jon.nettleton@gmail.com> S: Maintained W: http://wiki.laptop.org/go/DCON @@ -17983,7 +18005,7 @@ F: net/switchdev/ SY8106A REGULATOR DRIVER M: Icenowy Zheng <icenowy@aosc.io> S: Maintained -F: Documentation/devicetree/bindings/regulator/sy8106a-regulator.txt +F: Documentation/devicetree/bindings/regulator/silergy,sy8106a.yaml F: drivers/regulator/sy8106a-regulator.c SYNC FILE FRAMEWORK @@ -20332,6 +20354,7 @@ X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner <tglx@linutronix.de> M: Ingo Molnar <mingo@redhat.com> M: Borislav Petkov <bp@alien8.de> +M: Dave Hansen <dave.hansen@linux.intel.com> M: x86@kernel.org R: "H. Peter Anvin" <hpa@zytor.com> L: linux-kernel@vger.kernel.org @@ -20700,7 +20723,6 @@ S: Maintained F: mm/zbud.c ZD1211RW WIRELESS DRIVER -M: Daniel Drake <dsd@gentoo.org> M: Ulrich Kunitz <kune@deine-taler.de> L: linux-wireless@vger.kernel.org L: zd1211-devs@lists.sourceforge.net (subscribers-only) @@ -2,8 +2,8 @@ VERSION = 5 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc4 -NAME = Opossums on Parade +EXTRAVERSION = +NAME = Trick or Treat # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -1115,7 +1115,8 @@ export MODORDER := $(extmod_prefix)modules.order export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ +core-$(CONFIG_BLOCK) += block/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 3a5a80f302e1..b4ae6058902a 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -40,7 +40,6 @@ config ARC select HAVE_KRETPROBES select HAVE_MOD_ARCH_SPECIFIC select HAVE_PERF_EVENTS - select HANDLE_DOMAIN_IRQ select IRQ_DOMAIN select MODULES_USE_ELF_RELA select OF diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index e201b4b1655a..e8c2c7469e10 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -36,6 +36,7 @@ void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); void dma_cache_wback_inv(phys_addr_t start, unsigned long sz); void dma_cache_inv(phys_addr_t start, unsigned long sz); diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 9320b04c04bf..4cf45a99fd79 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -26,11 +26,6 @@ extern char empty_zero_page[PAGE_SIZE]; extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); -/* Macro to mark a page protection as uncacheable */ -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE)) - -extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); - /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arc/kernel/irq.c b/arch/arc/kernel/irq.c index ef909dd4b40c..dd09b58ff82d 100644 --- a/arch/arc/kernel/irq.c +++ b/arch/arc/kernel/irq.c @@ -6,6 +6,8 @@ #include <linux/interrupt.h> #include <linux/irqchip.h> #include <asm/mach_desc.h> + +#include <asm/irq_regs.h> #include <asm/smp.h> /* @@ -39,5 +41,11 @@ void __init init_IRQ(void) */ void arch_do_IRQ(unsigned int hwirq, struct pt_regs *regs) { - handle_domain_irq(NULL, hwirq, regs); + struct pt_regs *old_regs; + + irq_enter(); + old_regs = set_irq_regs(regs); + 
generic_handle_domain_irq(NULL, hwirq); + set_irq_regs(old_regs); + irq_exit(); } diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fc196421b2ce..2c056c0e834e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -64,7 +64,6 @@ config ARM select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD - select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 @@ -92,6 +91,7 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL + select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) select HAVE_IRQ_TIME_ACCOUNTING @@ -1989,8 +1989,6 @@ config ARCH_HIBERNATION_POSSIBLE endmenu -source "drivers/firmware/Kconfig" - if CRYPTO source "arch/arm/crypto/Kconfig" endif diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index aa075d8372ea..74255e819831 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -47,7 +47,10 @@ extern char * strchrnul(const char *, int); #endif #ifdef CONFIG_KERNEL_XZ +/* Prevent KASAN override of string helpers in decompressor */ +#undef memmove #define memmove memmove +#undef memcpy #define memcpy memcpy #include "../../../../lib/decompress_unxz.c" #endif diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index 614999dcb990..cd4672501add 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -71,7 +71,6 @@ isc: isc@f0008000 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_isc_base &pinctrl_isc_data_8bit &pinctrl_isc_data_9_10 &pinctrl_isc_data_11_12>; - status = "okay"; }; qspi1: spi@f0024000 { diff --git a/arch/arm/boot/dts/at91-sama7g5ek.dts b/arch/arm/boot/dts/at91-sama7g5ek.dts index 4cbed98cc2f4..f3d6aaa3a78d 100644 --- a/arch/arm/boot/dts/at91-sama7g5ek.dts +++ b/arch/arm/boot/dts/at91-sama7g5ek.dts @@ -196,11 +196,13 @@ regulator-state-standby { regulator-on-in-suspend; + regulator-suspend-microvolt = <1350000>; regulator-mode = <4>; }; regulator-state-mem { regulator-on-in-suspend; + regulator-suspend-microvolt = <1350000>; regulator-mode = <4>; }; }; @@ -353,7 +355,10 @@ #address-cells = <1>; #size-cells = <0>; pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_gmac0_default &pinctrl_gmac0_txck_default &pinctrl_gmac0_phy_irq>; + pinctrl-0 = <&pinctrl_gmac0_default + &pinctrl_gmac0_mdio_default + &pinctrl_gmac0_txck_default + &pinctrl_gmac0_phy_irq>; phy-mode = "rgmii-id"; status = "okay"; @@ -368,7 +373,9 @@ #address-cells = <1>; #size-cells = <0>; pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_gmac1_default &pinctrl_gmac1_phy_irq>; + pinctrl-0 = <&pinctrl_gmac1_default + &pinctrl_gmac1_mdio_default + &pinctrl_gmac1_phy_irq>; phy-mode = "rmii"; status = "okay"; @@ -423,14 +430,20 @@ <PIN_PA15__G0_TXEN>, <PIN_PA30__G0_RXCK>, <PIN_PA18__G0_RXDV>, - <PIN_PA22__G0_MDC>, - <PIN_PA23__G0_MDIO>, <PIN_PA25__G0_125CK>; + slew-rate = <0>; + bias-disable; + }; + + pinctrl_gmac0_mdio_default: gmac0_mdio_default { + pinmux = <PIN_PA22__G0_MDC>, + <PIN_PA23__G0_MDIO>; bias-disable; }; pinctrl_gmac0_txck_default: gmac0_txck_default { pinmux = <PIN_PA24__G0_TXCK>; + slew-rate = <0>; bias-pull-up; }; @@ -447,8 +460,13 @@ <PIN_PD25__G1_RX0>, 
<PIN_PD26__G1_RX1>, <PIN_PD27__G1_RXER>, - <PIN_PD24__G1_RXDV>, - <PIN_PD28__G1_MDC>, + <PIN_PD24__G1_RXDV>; + slew-rate = <0>; + bias-disable; + }; + + pinctrl_gmac1_mdio_default: gmac1_mdio_default { + pinmux = <PIN_PD28__G1_MDC>, <PIN_PD29__G1_MDIO>; bias-disable; }; @@ -540,6 +558,7 @@ <PIN_PA8__SDMMC0_DAT5>, <PIN_PA9__SDMMC0_DAT6>, <PIN_PA10__SDMMC0_DAT7>; + slew-rate = <0>; bias-pull-up; }; @@ -547,6 +566,7 @@ pinmux = <PIN_PA0__SDMMC0_CK>, <PIN_PA2__SDMMC0_RSTN>, <PIN_PA11__SDMMC0_DS>; + slew-rate = <0>; bias-pull-up; }; }; @@ -558,6 +578,7 @@ <PIN_PC0__SDMMC1_DAT1>, <PIN_PC1__SDMMC1_DAT2>, <PIN_PC2__SDMMC1_DAT3>; + slew-rate = <0>; bias-pull-up; }; @@ -566,6 +587,7 @@ <PIN_PB28__SDMMC1_RSTN>, <PIN_PC5__SDMMC1_1V8SEL>, <PIN_PC4__SDMMC1_CD>; + slew-rate = <0>; bias-pull-up; }; }; @@ -577,11 +599,13 @@ <PIN_PD6__SDMMC2_DAT1>, <PIN_PD7__SDMMC2_DAT2>, <PIN_PD8__SDMMC2_DAT3>; + slew-rate = <0>; bias-pull-up; }; ck { pinmux = <PIN_PD4__SDMMC2_CK>; + slew-rate = <0>; bias-pull-up; }; }; @@ -634,6 +658,15 @@ pinctrl-0 = <&pinctrl_sdmmc2_default>; }; +&shdwc { + atmel,shdwc-debouncer = <976>; + status = "okay"; + + input@0 { + reg = <0>; + }; +}; + &spdifrx { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_spdifrx_default>; diff --git a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts index f24bdd0870a5..72ce80fbf266 100644 --- a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts +++ b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts @@ -40,8 +40,8 @@ regulator-always-on; regulator-settling-time-us = <5000>; gpios = <&expgpio 4 GPIO_ACTIVE_HIGH>; - states = <1800000 0x1 - 3300000 0x0>; + states = <1800000 0x1>, + <3300000 0x0>; status = "okay"; }; @@ -217,15 +217,16 @@ }; &pcie0 { - pci@1,0 { + pci@0,0 { + device_type = "pci"; #address-cells = <3>; #size-cells = <2>; ranges; reg = <0 0 0 0 0>; - usb@1,0 { - reg = <0x10000 0 0 0 0>; + usb@0,0 { + reg = <0 0 0 0 0>; resets = <&reset RASPBERRYPI_FIRMWARE_RESET_ID_USB>; }; }; diff --git a/arch/arm/boot/dts/bcm2711.dtsi b/arch/arm/boot/dts/bcm2711.dtsi index b8a4096192aa..3b60297af7f6 100644 --- a/arch/arm/boot/dts/bcm2711.dtsi +++ b/arch/arm/boot/dts/bcm2711.dtsi @@ -300,6 +300,14 @@ status = "disabled"; }; + vec: vec@7ec13000 { + compatible = "brcm,bcm2711-vec"; + reg = <0x7ec13000 0x1000>; + clocks = <&clocks BCM2835_CLOCK_VEC>; + interrupts = <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + dvp: clock@7ef00000 { compatible = "brcm,brcm2711-dvp"; reg = <0x7ef00000 0x10>; @@ -532,8 +540,8 @@ compatible = "brcm,genet-mdio-v5"; reg = <0xe14 0x8>; reg-names = "mdio"; - #address-cells = <0x0>; - #size-cells = <0x1>; + #address-cells = <0x1>; + #size-cells = <0x0>; }; }; }; diff --git a/arch/arm/boot/dts/bcm2835-common.dtsi b/arch/arm/boot/dts/bcm2835-common.dtsi index 4119271c979d..c25e797b9060 100644 --- a/arch/arm/boot/dts/bcm2835-common.dtsi +++ b/arch/arm/boot/dts/bcm2835-common.dtsi @@ -106,6 +106,14 @@ status = "okay"; }; + vec: vec@7e806000 { + compatible = "brcm,bcm2835-vec"; + reg = <0x7e806000 0x1000>; + clocks = <&clocks BCM2835_CLOCK_VEC>; + interrupts = <2 27>; + status = "disabled"; + }; + pixelvalve@7e807000 { compatible = "brcm,bcm2835-pixelvalve2"; reg = <0x7e807000 0x100>; diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi index 0f3be55201a5..a3e06b680947 100644 --- a/arch/arm/boot/dts/bcm283x.dtsi +++ b/arch/arm/boot/dts/bcm283x.dtsi @@ -464,14 +464,6 @@ status = "disabled"; }; - vec: vec@7e806000 { - compatible = "brcm,bcm2835-vec"; - reg = <0x7e806000 0x1000>; - clocks = <&clocks 
BCM2835_CLOCK_VEC>; - interrupts = <2 27>; - status = "disabled"; - }; - usb: usb@7e980000 { compatible = "brcm,bcm2835-usb"; reg = <0x7e980000 0x10000>; diff --git a/arch/arm/boot/dts/imx53-m53menlo.dts b/arch/arm/boot/dts/imx53-m53menlo.dts index d3082b9774e4..4f88e96d81dd 100644 --- a/arch/arm/boot/dts/imx53-m53menlo.dts +++ b/arch/arm/boot/dts/imx53-m53menlo.dts @@ -56,6 +56,7 @@ panel { compatible = "edt,etm0700g0dh6"; pinctrl-0 = <&pinctrl_display_gpio>; + pinctrl-names = "default"; enable-gpios = <&gpio6 0 GPIO_ACTIVE_HIGH>; port { @@ -76,8 +77,7 @@ regulator-name = "vbus"; regulator-min-microvolt = <5000000>; regulator-max-microvolt = <5000000>; - gpio = <&gpio1 2 GPIO_ACTIVE_HIGH>; - enable-active-high; + gpio = <&gpio1 2 0>; }; }; diff --git a/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi b/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi index cb8b539eb29d..e5c4dc65fbab 100644 --- a/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi +++ b/arch/arm/boot/dts/imx6dl-yapp4-common.dtsi @@ -5,6 +5,7 @@ #include <dt-bindings/gpio/gpio.h> #include <dt-bindings/interrupt-controller/irq.h> #include <dt-bindings/input/input.h> +#include <dt-bindings/leds/common.h> #include <dt-bindings/pwm/pwm.h> / { @@ -277,6 +278,7 @@ led-cur = /bits/ 8 <0x20>; max-cur = /bits/ 8 <0x60>; reg = <0>; + color = <LED_COLOR_ID_RED>; }; chan@1 { @@ -284,6 +286,7 @@ led-cur = /bits/ 8 <0x20>; max-cur = /bits/ 8 <0x60>; reg = <1>; + color = <LED_COLOR_ID_GREEN>; }; chan@2 { @@ -291,6 +294,7 @@ led-cur = /bits/ 8 <0x20>; max-cur = /bits/ 8 <0x60>; reg = <2>; + color = <LED_COLOR_ID_BLUE>; }; chan@3 { @@ -298,6 +302,7 @@ led-cur = /bits/ 8 <0x0>; max-cur = /bits/ 8 <0x0>; reg = <3>; + color = <LED_COLOR_ID_WHITE>; }; }; diff --git a/arch/arm/boot/dts/imx6qdl-pico.dtsi b/arch/arm/boot/dts/imx6qdl-pico.dtsi index 5de4ccb97916..f7a56d6b160c 100644 --- a/arch/arm/boot/dts/imx6qdl-pico.dtsi +++ b/arch/arm/boot/dts/imx6qdl-pico.dtsi @@ -176,7 +176,18 @@ pinctrl-0 = <&pinctrl_enet>; phy-mode = "rgmii-id"; phy-reset-gpios = <&gpio1 26 GPIO_ACTIVE_LOW>; + phy-handle = <&phy>; status = "okay"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + phy: ethernet-phy@1 { + reg = <1>; + qca,clk-out-frequency = <125000000>; + }; + }; }; &hdmi { diff --git a/arch/arm/boot/dts/imx6sx-sdb.dts b/arch/arm/boot/dts/imx6sx-sdb.dts index 5a63ca615722..99f4cf777a38 100644 --- a/arch/arm/boot/dts/imx6sx-sdb.dts +++ b/arch/arm/boot/dts/imx6sx-sdb.dts @@ -114,7 +114,7 @@ compatible = "micron,n25q256a", "jedec,spi-nor"; spi-max-frequency = <29000000>; spi-rx-bus-width = <4>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; reg = <0>; }; @@ -124,7 +124,7 @@ compatible = "micron,n25q256a", "jedec,spi-nor"; spi-max-frequency = <29000000>; spi-rx-bus-width = <4>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; reg = <2>; }; }; diff --git a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi index 779cc536566d..a3fde3316c73 100644 --- a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi +++ b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi @@ -292,7 +292,7 @@ compatible = "micron,n25q256a", "jedec,spi-nor"; spi-max-frequency = <29000000>; spi-rx-bus-width = <4>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; reg = <0>; }; }; diff --git a/arch/arm/boot/dts/omap3430-sdp.dts b/arch/arm/boot/dts/omap3430-sdp.dts index c5b903718414..7d530ae3483b 100644 --- a/arch/arm/boot/dts/omap3430-sdp.dts +++ b/arch/arm/boot/dts/omap3430-sdp.dts @@ -101,7 +101,7 @@ nand@1,0 { compatible = "ti,omap2-nand"; - reg = <0 0 4>; /* CS0, offset 0, IO size 4 */ 
+ reg = <1 0 4>; /* CS1, offset 0, IO size 4 */ interrupt-parent = <&gpmc>; interrupts = <0 IRQ_TYPE_NONE>, /* fifoevent */ <1 IRQ_TYPE_NONE>; /* termcount */ diff --git a/arch/arm/boot/dts/qcom-apq8064.dtsi b/arch/arm/boot/dts/qcom-apq8064.dtsi index 0b2bed6e7adf..d1c1c6aab2b8 100644 --- a/arch/arm/boot/dts/qcom-apq8064.dtsi +++ b/arch/arm/boot/dts/qcom-apq8064.dtsi @@ -198,7 +198,7 @@ clock-frequency = <19200000>; }; - pxo_board { + pxo_board: pxo_board { compatible = "fixed-clock"; #clock-cells = <0>; clock-frequency = <27000000>; @@ -1148,22 +1148,21 @@ }; gpu: adreno-3xx@4300000 { - compatible = "qcom,adreno-3xx"; + compatible = "qcom,adreno-320.2", "qcom,adreno"; reg = <0x04300000 0x20000>; reg-names = "kgsl_3d0_reg_memory"; interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "kgsl_3d0_irq"; clock-names = - "core_clk", - "iface_clk", - "mem_clk", - "mem_iface_clk"; + "core", + "iface", + "mem", + "mem_iface"; clocks = <&mmcc GFX3D_CLK>, <&mmcc GFX3D_AHB_CLK>, <&mmcc GFX3D_AXI_CLK>, <&mmcc MMSS_IMEM_AHB_CLK>; - qcom,chipid = <0x03020002>; iommus = <&gfx3d 0 &gfx3d 1 @@ -1306,7 +1305,7 @@ reg-names = "dsi_pll", "dsi_phy", "dsi_phy_regulator"; clock-names = "iface_clk", "ref"; clocks = <&mmcc DSI_M_AHB_CLK>, - <&cxo_board>; + <&pxo_board>; }; diff --git a/arch/arm/boot/dts/sama7g5.dtsi b/arch/arm/boot/dts/sama7g5.dtsi index cc6be6db7b80..6c58c151c6d9 100644 --- a/arch/arm/boot/dts/sama7g5.dtsi +++ b/arch/arm/boot/dts/sama7g5.dtsi @@ -75,6 +75,17 @@ #size-cells = <1>; ranges; + securam: securam@e0000000 { + compatible = "microchip,sama7g5-securam", "atmel,sama5d2-securam", "mmio-sram"; + reg = <0xe0000000 0x4000>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 18>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0xe0000000 0x4000>; + no-memory-wc; + status = "okay"; + }; + secumod: secumod@e0004000 { compatible = "microchip,sama7g5-secumod", "atmel,sama5d2-secumod", "syscon"; reg = <0xe0004000 0x4000>; @@ -111,6 +122,17 @@ clock-names = "td_slck", "md_slck", "main_xtal"; }; + shdwc: shdwc@e001d010 { + compatible = "microchip,sama7g5-shdwc", "syscon"; + reg = <0xe001d010 0x10>; + clocks = <&clk32k 0>; + #address-cells = <1>; + #size-cells = <0>; + atmel,wakeup-rtc-timer; + atmel,wakeup-rtt-timer; + status = "disabled"; + }; + rtt: rtt@e001d020 { compatible = "microchip,sama7g5-rtt", "microchip,sam9x60-rtt", "atmel,at91sam9260-rtt"; reg = <0xe001d020 0x30>; @@ -137,6 +159,11 @@ clocks = <&clk32k 0>; }; + chipid@e0020000 { + compatible = "microchip,sama7g5-chipid"; + reg = <0xe0020000 0x8>; + }; + sdmmc0: mmc@e1204000 { compatible = "microchip,sama7g5-sdhci", "microchip,sam9x60-sdhci"; reg = <0xe1204000 0x4000>; @@ -515,6 +542,18 @@ }; }; + uddrc: uddrc@e3800000 { + compatible = "microchip,sama7g5-uddrc"; + reg = <0xe3800000 0x4000>; + status = "okay"; + }; + + ddr3phy: ddr3phy@e3804000 { + compatible = "microchip,sama7g5-ddr3phy"; + reg = <0xe3804000 0x1000>; + status = "okay"; + }; + gic: interrupt-controller@e8c11000 { compatible = "arm,cortex-a7-gic"; #interrupt-cells = <3>; diff --git a/arch/arm/boot/dts/spear3xx.dtsi b/arch/arm/boot/dts/spear3xx.dtsi index f266b7b03482..cc88ebe7a60c 100644 --- a/arch/arm/boot/dts/spear3xx.dtsi +++ b/arch/arm/boot/dts/spear3xx.dtsi @@ -47,7 +47,7 @@ }; gmac: eth@e0800000 { - compatible = "st,spear600-gmac"; + compatible = "snps,dwmac-3.40a"; reg = <0xe0800000 0x8000>; interrupts = <23 22>; interrupt-names = "macirq", "eth_wake_irq"; diff --git a/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts 
b/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts index 8077f1716fbc..ecb91fb899ff 100644 --- a/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts +++ b/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts @@ -112,7 +112,7 @@ pinctrl-names = "default"; pinctrl-0 = <&gmac_rgmii_pins>; phy-handle = <&phy1>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; status = "okay"; }; diff --git a/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi b/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi index 2ad9fd7c94ec..8af4b77fe655 100644 --- a/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi +++ b/arch/arm/boot/dts/vexpress-v2m-rs1.dtsi @@ -17,6 +17,7 @@ * TAKE CARE WHEN MAINTAINING THIS FILE TO PROPAGATE ANY RELEVANT * CHANGES TO vexpress-v2m.dtsi! */ +#include <dt-bindings/interrupt-controller/arm-gic.h> / { v2m_fixed_3v3: fixed-regulator-0 { @@ -101,16 +102,68 @@ }; bus@8000000 { - motherboard-bus { - model = "V2M-P1"; + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 63>; + interrupt-map = <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, + <0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, + <0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, + <0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, + <0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, + <0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, + <0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, + <0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, + <0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, + <0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, + <0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, + <0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, + <0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, + <0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, + <0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, + <0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, + <0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, + <0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, + <0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, + <0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, + <0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, + <0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, + <0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + + motherboard-bus@8000000 { arm,hbi = <0x190>; arm,vexpress,site = <0>; - arm,v2m-memory-map = "rs1"; compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0x08000000 0x04000000>, + <1 0 0x14000000 0x04000000>, + <2 0 0x18000000 0x04000000>, + <3 0 0x1c000000 0x04000000>, + <4 0 0x0c000000 0x04000000>, + <5 0 0x10000000 0x04000000>; nor_flash: flash@0 { compatible = "arm,vexpress-flash", "cfi-flash"; @@ -215,7 +268,7 @@ 
clock-names = "apb_pclk"; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <9>, <10>; @@ -275,7 +328,7 @@ clock-names = "uartclk", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x1000>; interrupts = <0>; diff --git a/arch/arm/boot/dts/vexpress-v2m.dtsi b/arch/arm/boot/dts/vexpress-v2m.dtsi index ec13ceb9ed36..f434fe5cf4a1 100644 --- a/arch/arm/boot/dts/vexpress-v2m.dtsi +++ b/arch/arm/boot/dts/vexpress-v2m.dtsi @@ -17,18 +17,73 @@ * TAKE CARE WHEN MAINTAINING THIS FILE TO PROPAGATE ANY RELEVANT * CHANGES TO vexpress-v2m-rs1.dtsi! */ +#include <dt-bindings/interrupt-controller/arm-gic.h> / { - bus@4000000 { - motherboard { - model = "V2M-P1"; + bus@40000000 { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x40000000 0x40000000 0x10000000>, + <0x10000000 0x10000000 0x00020000>; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 63>; + interrupt-map = <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, + <0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, + <0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, + <0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, + <0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, + <0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, + <0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, + <0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, + <0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, + <0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, + <0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, + <0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, + <0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, + <0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, + <0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, + <0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, + <0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, + <0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, + <0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, + <0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, + <0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, + <0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, + <0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, + <0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, + <0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, + <0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, + <0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, + <0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, + <0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + + motherboard-bus@40000000 { arm,hbi = <0x190>; arm,vexpress,site = <0>; compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0x40000000 0x04000000>, + <1 0 0x44000000 0x04000000>, + <2 0 0x48000000 0x04000000>, + <3 0 0x4c000000 0x04000000>, + <7 0 0x10000000 0x00020000>; flash@0,00000000 { compatible = "arm,vexpress-flash", "cfi-flash"; diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts 
b/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts index e63c5c0bfb43..679537e17ff5 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca15-tc1.dts @@ -237,62 +237,7 @@ }; bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; + ranges = <0x8000000 0 0x8000000 0x18000000>; }; site2: hsb@40000000 { diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts index 012d40a7228c..511e87cc2bc5 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts @@ -609,62 +609,7 @@ }; smb: bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; + ranges = <0x8000000 0 0x8000000 0x18000000>; }; site2: hsb@40000000 { diff --git a/arch/arm/boot/dts/vexpress-v2p-ca5s.dts b/arch/arm/boot/dts/vexpress-v2p-ca5s.dts index 7aa64ae25779..3b88209bacea 100644 --- 
a/arch/arm/boot/dts/vexpress-v2p-ca5s.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca5s.dts @@ -207,62 +207,7 @@ }; smb: bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0x08000000 0x04000000>, - <1 0 0x14000000 0x04000000>, - <2 0 0x18000000 0x04000000>, - <3 0 0x1c000000 0x04000000>, - <4 0 0x0c000000 0x04000000>, - <5 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; + ranges = <0 0x8000000 0x18000000>; }; site2: hsb@40000000 { diff --git a/arch/arm/boot/dts/vexpress-v2p-ca9.dts b/arch/arm/boot/dts/vexpress-v2p-ca9.dts index 4c5847955856..5916e4877eac 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca9.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca9.dts @@ -295,64 +295,6 @@ }; }; - smb: bus@4000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0x40000000 0x04000000>, - <1 0 0x44000000 0x04000000>, - <2 0 0x48000000 0x04000000>, - <3 0 0x4c000000 0x04000000>, - <7 0 0x10000000 0x00020000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic 0 0 4>, - <0 0 1 &gic 0 1 4>, - <0 0 2 &gic 0 2 4>, - <0 0 3 &gic 0 3 4>, - <0 0 4 &gic 0 4 4>, - <0 0 5 &gic 0 5 4>, - <0 0 6 &gic 0 6 4>, - <0 0 7 &gic 0 7 4>, - <0 0 8 &gic 0 8 4>, - <0 0 9 &gic 0 9 4>, - <0 0 10 &gic 0 10 4>, - <0 0 11 &gic 0 11 4>, - <0 0 12 &gic 0 12 4>, - <0 0 13 &gic 0 13 4>, - <0 0 14 &gic 0 14 4>, - <0 0 15 &gic 0 15 4>, - <0 0 16 &gic 0 16 4>, - <0 0 17 &gic 0 17 4>, - <0 0 18 &gic 0 18 4>, - <0 0 19 &gic 0 19 4>, - <0 0 20 &gic 0 20 4>, - <0 0 21 &gic 0 21 4>, - <0 0 22 &gic 0 22 4>, - <0 0 23 &gic 0 23 4>, - <0 0 24 &gic 0 24 4>, - <0 0 25 &gic 0 25 4>, - <0 0 26 &gic 0 26 4>, - <0 0 27 &gic 0 27 4>, - <0 0 28 &gic 0 28 4>, - <0 0 29 &gic 0 29 4>, - <0 0 30 &gic 0 30 4>, - <0 0 31 &gic 0 31 4>, - <0 0 32 &gic 0 32 4>, - <0 0 33 &gic 0 33 4>, - <0 0 34 &gic 0 34 4>, - <0 0 35 &gic 0 35 4>, - <0 0 36 &gic 0 36 4>, - <0 0 37 &gic 0 37 4>, - <0 0 38 &gic 0 38 4>, - <0 0 39 &gic 0 39 4>, - <0 0 40 &gic 0 40 4>, - <0 0 41 &gic 0 41 4>, - <0 0 42 &gic 0 42 4>; - }; - site2: hsb@e0000000 { compatible = "simple-bus"; #address-cells = <1>; diff --git a/arch/arm/common/sharpsl_param.c b/arch/arm/common/sharpsl_param.c index efeb5724d9e9..6237ede2f0c7 100644 --- a/arch/arm/common/sharpsl_param.c +++ b/arch/arm/common/sharpsl_param.c @@ -40,7 +40,9 @@ EXPORT_SYMBOL(sharpsl_param); void sharpsl_save_param(void) { - memcpy(&sharpsl_param, 
param_start(PARAM_BASE), sizeof(struct sharpsl_param_info)); + struct sharpsl_param_info *params = param_start(PARAM_BASE); + + memcpy(&sharpsl_param, params, sizeof(*params)); if (sharpsl_param.comadj_keyword != COMADJ_MAGIC) sharpsl_param.comadj=-1; diff --git a/arch/arm/configs/gemini_defconfig b/arch/arm/configs/gemini_defconfig index d2d5f1cf815f..e6ff844821cf 100644 --- a/arch/arm/configs/gemini_defconfig +++ b/arch/arm/configs/gemini_defconfig @@ -76,6 +76,7 @@ CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_DRM=y CONFIG_DRM_PANEL_ILITEK_IL9322=y CONFIG_DRM_TVE200=y +CONFIG_FB=y CONFIG_LOGO=y CONFIG_USB=y CONFIG_USB_MON=y diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index ccee86d0045d..5e4128dadd8d 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -292,6 +292,7 @@ CONFIG_DRM_IMX_LDB=y CONFIG_DRM_IMX_HDMI=y CONFIG_DRM_ETNAVIV=y CONFIG_DRM_MXSFB=y +CONFIG_FB=y CONFIG_FB_MODE_HELPERS=y CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_L4F00242T03=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index ba67c4717dcc..33572998dbbe 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -197,7 +197,6 @@ CONFIG_PCI_EPF_TEST=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_OMAP_OCP2SCP=y -CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y @@ -456,6 +455,7 @@ CONFIG_PINCTRL_STMFX=y CONFIG_PINCTRL_PALMAS=y CONFIG_PINCTRL_OWL=y CONFIG_PINCTRL_S500=y +CONFIG_PINCTRL_MSM=y CONFIG_PINCTRL_APQ8064=y CONFIG_PINCTRL_APQ8084=y CONFIG_PINCTRL_IPQ8064=y @@ -725,6 +725,7 @@ CONFIG_DRM_PL111=m CONFIG_DRM_LIMA=m CONFIG_DRM_PANFROST=m CONFIG_DRM_ASPEED_GFX=m +CONFIG_FB=y CONFIG_FB_EFI=y CONFIG_FB_WM8505=y CONFIG_FB_SH_MOBILE_LCDC=y @@ -1122,6 +1123,7 @@ CONFIG_PHY_DM816X_USB=m CONFIG_OMAP_USB2=y CONFIG_TI_PIPE3=y CONFIG_TWL4030_USB=m +CONFIG_RAS=y CONFIG_NVMEM_IMX_OCOTP=y CONFIG_ROCKCHIP_EFUSE=m CONFIG_NVMEM_SUNXI_SID=y diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig index cae0db6b4eaf..de37f7e90999 100644 --- a/arch/arm/configs/oxnas_v6_defconfig +++ b/arch/arm/configs/oxnas_v6_defconfig @@ -46,7 +46,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=64 -CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y diff --git a/arch/arm/configs/shmobile_defconfig b/arch/arm/configs/shmobile_defconfig index d9a27e4e0914..18d2a960b2d2 100644 --- a/arch/arm/configs/shmobile_defconfig +++ b/arch/arm/configs/shmobile_defconfig @@ -40,7 +40,6 @@ CONFIG_PCI_RCAR_GEN2=y CONFIG_PCIE_RCAR_HOST=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y -CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y diff --git a/arch/arm/include/asm/arch_timer.h b/arch/arm/include/asm/arch_timer.h index 99175812d903..bb129b6d2366 100644 --- a/arch/arm/include/asm/arch_timer.h +++ b/arch/arm/include/asm/arch_timer.h @@ -7,6 +7,7 @@ #include <asm/hwcap.h> #include <linux/clocksource.h> #include <linux/init.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include <linux/types.h> #include <clocksource/arm_arch_timer.h> @@ -24,29 +25,35 @@ int arch_timer_arch_init(void); * the code. At least it does so with a recent GCC (4.6.3). 
*/ static __always_inline -void arch_timer_reg_write_cp15(int access, enum arch_timer_reg reg, u32 val) +void arch_timer_reg_write_cp15(int access, enum arch_timer_reg reg, u64 val) { if (access == ARCH_TIMER_PHYS_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: - asm volatile("mcr p15, 0, %0, c14, c2, 1" : : "r" (val)); + asm volatile("mcr p15, 0, %0, c14, c2, 1" : : "r" ((u32)val)); + isb(); break; - case ARCH_TIMER_REG_TVAL: - asm volatile("mcr p15, 0, %0, c14, c2, 0" : : "r" (val)); + case ARCH_TIMER_REG_CVAL: + asm volatile("mcrr p15, 2, %Q0, %R0, c14" : : "r" (val)); break; + default: + BUILD_BUG(); } } else if (access == ARCH_TIMER_VIRT_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: - asm volatile("mcr p15, 0, %0, c14, c3, 1" : : "r" (val)); + asm volatile("mcr p15, 0, %0, c14, c3, 1" : : "r" ((u32)val)); + isb(); break; - case ARCH_TIMER_REG_TVAL: - asm volatile("mcr p15, 0, %0, c14, c3, 0" : : "r" (val)); + case ARCH_TIMER_REG_CVAL: + asm volatile("mcrr p15, 3, %Q0, %R0, c14" : : "r" (val)); break; + default: + BUILD_BUG(); } + } else { + BUILD_BUG(); } - - isb(); } static __always_inline @@ -59,19 +66,19 @@ u32 arch_timer_reg_read_cp15(int access, enum arch_timer_reg reg) case ARCH_TIMER_REG_CTRL: asm volatile("mrc p15, 0, %0, c14, c2, 1" : "=r" (val)); break; - case ARCH_TIMER_REG_TVAL: - asm volatile("mrc p15, 0, %0, c14, c2, 0" : "=r" (val)); - break; + default: + BUILD_BUG(); } } else if (access == ARCH_TIMER_VIRT_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: asm volatile("mrc p15, 0, %0, c14, c3, 1" : "=r" (val)); break; - case ARCH_TIMER_REG_TVAL: - asm volatile("mrc p15, 0, %0, c14, c3, 0" : "=r" (val)); - break; + default: + BUILD_BUG(); } + } else { + BUILD_BUG(); } return val; diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 5e56288e343b..e68fb879e4f9 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -290,6 +290,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *folio); #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 084d1c07c2d0..36fbc3329252 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -176,6 +176,7 @@ extern int __get_user_64t_4(void *); register unsigned long __l asm("r1") = __limit; \ register int __e asm("r0"); \ unsigned int __ua_flags = uaccess_save_and_enable(); \ + int __tmp_e; \ switch (sizeof(*(__p))) { \ case 1: \ if (sizeof((x)) >= 8) \ @@ -203,9 +204,10 @@ extern int __get_user_64t_4(void *); break; \ default: __e = __get_user_bad(); break; \ } \ + __tmp_e = __e; \ uaccess_restore(__ua_flags); \ x = (typeof(*(p))) __r2; \ - __e; \ + __tmp_e; \ }) #define get_user(x, p) \ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 241b73d64df7..3d0b6169ab86 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -38,14 +38,11 @@ */ .macro irq_handler #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER - ldr r1, =handle_arch_irq mov r0, sp - badr lr, 9997f - ldr pc, [r1] + bl generic_handle_arch_irq #else arch_irq_handler_default #endif -9997: .endm .macro pabt_helper diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 29070eb8df7d..3fc7f9750ce4 100644 --- 
a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -253,7 +253,7 @@ __create_page_tables: add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ORDER) ldr r6, =(_end - 1) adr_l r5, kernel_sec_start @ _pa(kernel_sec_start) -#ifdef CONFIG_CPU_ENDIAN_BE8 +#if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32 str r8, [r5, #4] @ Save physical start of kernel (BE) #else str r8, [r5] @ Save physical start of kernel (LE) @@ -266,7 +266,7 @@ __create_page_tables: bls 1b eor r3, r3, r7 @ Remove the MMU flags adr_l r5, kernel_sec_end @ _pa(kernel_sec_end) -#ifdef CONFIG_CPU_ENDIAN_BE8 +#if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32 str r3, [r5, #4] @ Save physical end of kernel (BE) #else str r3, [r5] @ Save physical end of kernel (LE) diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 20ab1e607522..b79975bd988c 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -63,11 +63,8 @@ int arch_show_interrupts(struct seq_file *p, int prec) */ void handle_IRQ(unsigned int irq, struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - irq_enter(); - /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. @@ -81,9 +78,6 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs) handle_irq_desc(desc); else ack_bad_irq(irq); - - irq_exit(); - set_irq_regs(old_regs); } /* @@ -92,7 +86,15 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs) asmlinkage void __exception_irq_entry asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { + struct pt_regs *old_regs; + + irq_enter(); + old_regs = set_irq_regs(regs); + handle_IRQ(irq, regs); + + set_irq_regs(old_regs); + irq_exit(); } void __init init_IRQ(void) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 4a7edc6e848f..195dff58bafc 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -136,7 +136,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, for (p = first, i = 0; i < 8 && p < top; i++, p += 4) { if (p >= bottom && p < top) { unsigned long val; - if (get_kernel_nofault(val, (unsigned long *)p)) + if (!get_kernel_nofault(val, (unsigned long *)p)) sprintf(str + i * 9, " %08lx", val); else sprintf(str + i * 9, " ????????"); diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index 50136828f5b5..f14c2360ea0b 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -40,6 +40,10 @@ SECTIONS ARM_DISCARD *(.alt.smp.init) *(.pv_table) +#ifndef CONFIG_ARM_UNWIND + *(.ARM.exidx) *(.ARM.exidx.*) + *(.ARM.extab) *(.ARM.extab.*) +#endif } . = XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR); @@ -172,7 +176,7 @@ ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined") ASSERT((_end - __bss_start) >= 12288, ".bss too small for CONFIG_XIP_DEFLATED_DATA") #endif -#ifdef CONFIG_ARM_MPU +#if defined(CONFIG_ARM_MPU) && !defined(CONFIG_COMPILE_TEST) /* * Due to PMSAv7 restriction on base address and size we have to * enforce minimal alignment restrictions. 
It was seen that weaker diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index d6cfe7c4bb00..8711d6824c1f 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -47,12 +47,26 @@ struct at91_pm_bu { unsigned long ddr_phy_calibration[BACKUP_DDR_PHY_CALIBRATION]; }; +/* + * struct at91_pm_sfrbu_offsets: registers mapping for SFRBU + * @pswbu: power switch BU control registers + */ +struct at91_pm_sfrbu_regs { + struct { + u32 key; + u32 ctrl; + u32 state; + u32 softsw; + } pswbu; +}; + /** * struct at91_soc_pm - AT91 SoC power management data structure * @config_shdwc_ws: wakeup sources configuration function for SHDWC * @config_pmc_ws: wakeup srouces configuration function for PMC * @ws_ids: wakup sources of_device_id array * @data: PM data to be used on last phase of suspend + * @sfrbu_regs: SFRBU registers mapping * @bu: backup unit mapped data (for backup mode) * @memcs: memory chip select */ @@ -62,6 +76,7 @@ struct at91_soc_pm { const struct of_device_id *ws_ids; struct at91_pm_bu *bu; struct at91_pm_data data; + struct at91_pm_sfrbu_regs sfrbu_regs; void *memcs; }; @@ -356,9 +371,36 @@ static int at91_suspend_finish(unsigned long val) return 0; } +static void at91_pm_switch_ba_to_vbat(void) +{ + unsigned int offset = offsetof(struct at91_pm_sfrbu_regs, pswbu); + unsigned int val; + + /* Just for safety. */ + if (!soc_pm.data.sfrbu) + return; + + val = readl(soc_pm.data.sfrbu + offset); + + /* Already on VBAT. */ + if (!(val & soc_pm.sfrbu_regs.pswbu.state)) + return; + + val &= ~soc_pm.sfrbu_regs.pswbu.softsw; + val |= soc_pm.sfrbu_regs.pswbu.key | soc_pm.sfrbu_regs.pswbu.ctrl; + writel(val, soc_pm.data.sfrbu + offset); + + /* Wait for update. */ + val = readl(soc_pm.data.sfrbu + offset); + while (val & soc_pm.sfrbu_regs.pswbu.state) + val = readl(soc_pm.data.sfrbu + offset); +} + static void at91_pm_suspend(suspend_state_t state) { if (soc_pm.data.mode == AT91_PM_BACKUP) { + at91_pm_switch_ba_to_vbat(); + cpu_suspend(0, at91_suspend_finish); /* The SRAM is lost between suspend cycles */ @@ -589,18 +631,22 @@ static const struct of_device_id ramc_phy_ids[] __initconst = { { /* Sentinel. */ }, }; -static __init void at91_dt_ramc(bool phy_mandatory) +static __init int at91_dt_ramc(bool phy_mandatory) { struct device_node *np; const struct of_device_id *of_id; int idx = 0; void *standby = NULL; const struct ramc_info *ramc; + int ret; for_each_matching_node_and_match(np, ramc_ids, &of_id) { soc_pm.data.ramc[idx] = of_iomap(np, 0); - if (!soc_pm.data.ramc[idx]) - panic(pr_fmt("unable to map ramc[%d] cpu registers\n"), idx); + if (!soc_pm.data.ramc[idx]) { + pr_err("unable to map ramc[%d] cpu registers\n", idx); + ret = -ENOMEM; + goto unmap_ramc; + } ramc = of_id->data; if (ramc) { @@ -612,25 +658,42 @@ static __init void at91_dt_ramc(bool phy_mandatory) idx++; } - if (!idx) - panic(pr_fmt("unable to find compatible ram controller node in dtb\n")); + if (!idx) { + pr_err("unable to find compatible ram controller node in dtb\n"); + ret = -ENODEV; + goto unmap_ramc; + } /* Lookup for DDR PHY node, if any. 
*/ for_each_matching_node_and_match(np, ramc_phy_ids, &of_id) { soc_pm.data.ramc_phy = of_iomap(np, 0); - if (!soc_pm.data.ramc_phy) - panic(pr_fmt("unable to map ramc phy cpu registers\n")); + if (!soc_pm.data.ramc_phy) { + pr_err("unable to map ramc phy cpu registers\n"); + ret = -ENOMEM; + goto unmap_ramc; + } } - if (phy_mandatory && !soc_pm.data.ramc_phy) - panic(pr_fmt("DDR PHY is mandatory!\n")); + if (phy_mandatory && !soc_pm.data.ramc_phy) { + pr_err("DDR PHY is mandatory!\n"); + ret = -ENODEV; + goto unmap_ramc; + } if (!standby) { pr_warn("ramc no standby function available\n"); - return; + return 0; } at91_cpuidle_device.dev.platform_data = standby; + + return 0; + +unmap_ramc: + while (idx) + iounmap(soc_pm.data.ramc[--idx]); + + return ret; } static void at91rm9200_idle(void) @@ -1017,6 +1080,8 @@ static void __init at91_pm_init(void (*pm_idle)(void)) void __init at91rm9200_pm_init(void) { + int ret; + if (!IS_ENABLED(CONFIG_SOC_AT91RM9200)) return; @@ -1028,7 +1093,9 @@ void __init at91rm9200_pm_init(void) soc_pm.data.standby_mode = AT91_PM_STANDBY; soc_pm.data.suspend_mode = AT91_PM_ULP0; - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; /* * AT91RM9200 SDRAM low-power mode cannot be used with self-refresh. @@ -1046,13 +1113,17 @@ void __init sam9x60_pm_init(void) static const int iomaps[] __initconst = { [AT91_PM_ULP1] = AT91_PM_IOMAP(SHDWC), }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAM9X60)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); at91_pm_modes_init(iomaps, ARRAY_SIZE(iomaps)); - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(NULL); soc_pm.ws_ids = sam9x60_ws_ids; @@ -1061,6 +1132,8 @@ void __init sam9x60_pm_init(void) void __init at91sam9_pm_init(void) { + int ret; + if (!IS_ENABLED(CONFIG_SOC_AT91SAM9)) return; @@ -1072,7 +1145,10 @@ void __init at91sam9_pm_init(void) soc_pm.data.standby_mode = AT91_PM_STANDBY; soc_pm.data.suspend_mode = AT91_PM_ULP0; - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(at91sam9_idle); } @@ -1081,12 +1157,16 @@ void __init sama5_pm_init(void) static const int modes[] __initconst = { AT91_PM_STANDBY, AT91_PM_ULP0, AT91_PM_ULP0_FAST, }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAMA5)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(NULL); } @@ -1101,18 +1181,27 @@ void __init sama5d2_pm_init(void) [AT91_PM_BACKUP] = AT91_PM_IOMAP(SHDWC) | AT91_PM_IOMAP(SFRBU), }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAMA5D2)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); at91_pm_modes_init(iomaps, ARRAY_SIZE(iomaps)); - at91_dt_ramc(false); + ret = at91_dt_ramc(false); + if (ret) + return; + at91_pm_init(NULL); soc_pm.ws_ids = sama5d2_ws_ids; soc_pm.config_shdwc_ws = at91_sama5d2_config_shdwc_ws; soc_pm.config_pmc_ws = at91_sama5d2_config_pmc_ws; + + soc_pm.sfrbu_regs.pswbu.key = (0x4BD20C << 8); + soc_pm.sfrbu_regs.pswbu.ctrl = BIT(0); + soc_pm.sfrbu_regs.pswbu.softsw = BIT(1); + soc_pm.sfrbu_regs.pswbu.state = BIT(3); } void __init sama7_pm_init(void) @@ -1127,18 +1216,27 @@ void __init sama7_pm_init(void) [AT91_PM_BACKUP] = AT91_PM_IOMAP(SFRBU) | AT91_PM_IOMAP(SHDWC), }; + int ret; if (!IS_ENABLED(CONFIG_SOC_SAMA7)) return; at91_pm_modes_validate(modes, ARRAY_SIZE(modes)); - at91_dt_ramc(true); + ret = at91_dt_ramc(true); + if (ret) + return; + at91_pm_modes_init(iomaps, ARRAY_SIZE(iomaps)); at91_pm_init(NULL); soc_pm.ws_ids = 
sama7g5_ws_ids; soc_pm.config_pmc_ws = at91_sam9x60_config_pmc_ws; + + soc_pm.sfrbu_regs.pswbu.key = (0x4BD20C << 8); + soc_pm.sfrbu_regs.pswbu.ctrl = BIT(0); + soc_pm.sfrbu_regs.pswbu.softsw = BIT(1); + soc_pm.sfrbu_regs.pswbu.state = BIT(2); } static int __init at91_pm_modes_select(char *str) diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S index cbd61a3bcab1..fdb4f63ecde4 100644 --- a/arch/arm/mach-at91/pm_suspend.S +++ b/arch/arm/mach-at91/pm_suspend.S @@ -1014,31 +1014,55 @@ ENTRY(at91_pm_suspend_in_sram) mov tmp1, #0 mcr p15, 0, tmp1, c7, c10, 4 + /* Flush tlb. */ + mov r4, #0 + mcr p15, 0, r4, c8, c7, 0 + + ldr tmp1, [r0, #PM_DATA_PMC_MCKR_OFFSET] + str tmp1, .mckr_offset + ldr tmp1, [r0, #PM_DATA_PMC_VERSION] + str tmp1, .pmc_version + ldr tmp1, [r0, #PM_DATA_MEMCTRL] + str tmp1, .memtype + ldr tmp1, [r0, #PM_DATA_MODE] + str tmp1, .pm_mode + + /* + * ldrne below are here to preload their address in the TLB as access + * to RAM may be limited while in self-refresh. + */ ldr tmp1, [r0, #PM_DATA_PMC] str tmp1, .pmc_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + ldr tmp1, [r0, #PM_DATA_RAMC0] str tmp1, .sramc_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + ldr tmp1, [r0, #PM_DATA_RAMC1] str tmp1, .sramc1_base + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + +#ifndef CONFIG_SOC_SAM_V4_V5 + /* ldrne below are here to preload their address in the TLB */ ldr tmp1, [r0, #PM_DATA_RAMC_PHY] str tmp1, .sramc_phy_base - ldr tmp1, [r0, #PM_DATA_MEMCTRL] - str tmp1, .memtype - ldr tmp1, [r0, #PM_DATA_MODE] - str tmp1, .pm_mode - ldr tmp1, [r0, #PM_DATA_PMC_MCKR_OFFSET] - str tmp1, .mckr_offset - ldr tmp1, [r0, #PM_DATA_PMC_VERSION] - str tmp1, .pmc_version - /* Both ldrne below are here to preload their address in the TLB */ + cmp tmp1, #0 + ldrne tmp2, [tmp1, #0] + ldr tmp1, [r0, #PM_DATA_SHDWC] str tmp1, .shdwc cmp tmp1, #0 ldrne tmp2, [tmp1, #0] + ldr tmp1, [r0, #PM_DATA_SFRBU] str tmp1, .sfrbu cmp tmp1, #0 ldrne tmp2, [tmp1, #0x10] +#endif /* Active the self-refresh mode */ at91_sramc_self_refresh_ena diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig index 2890e61b2b46..bd3f82788ebc 100644 --- a/arch/arm/mach-bcm/Kconfig +++ b/arch/arm/mach-bcm/Kconfig @@ -161,7 +161,6 @@ config ARCH_BCM2835 select ARM_TIMER_SP804 select HAVE_ARM_ARCH_TIMER if ARCH_MULTI_V7 select BCM2835_TIMER - select BRCMSTB_L2_IRQ select PINCTRL select PINCTRL_BCM2835 select MFD_CORE @@ -209,9 +208,6 @@ config ARCH_BRCMSTB select ARM_GIC select ARM_ERRATA_798181 if SMP select HAVE_ARM_ARCH_TIMER - select BCM7038_L1_IRQ - select BRCMSTB_L2_IRQ - select BCM7120_L2_IRQ select ZONE_DMA if ARM_LPAE select SOC_BRCMSTB select SOC_BUS diff --git a/arch/arm/mach-dove/include/mach/uncompress.h b/arch/arm/mach-dove/include/mach/uncompress.h index 7a4bd8838036..ddf873f35e2b 100644 --- a/arch/arm/mach-dove/include/mach/uncompress.h +++ b/arch/arm/mach-dove/include/mach/uncompress.h @@ -11,7 +11,7 @@ #define LSR_THRE 0x20 -static void putc(const char c) +static inline void putc(const char c) { int i; @@ -24,7 +24,7 @@ static void putc(const char c) *UART_THR = c; } -static void flush(void) +static inline void flush(void) { } diff --git a/arch/arm/mach-imx/avic.c b/arch/arm/mach-imx/avic.c index 21bce4049cec..cf6546ddc7a3 100644 --- a/arch/arm/mach-imx/avic.c +++ b/arch/arm/mach-imx/avic.c @@ -154,7 +154,7 @@ static void __exception_irq_entry avic_handle_irq(struct pt_regs *regs) if (nivector == 0xffff) break; - handle_domain_irq(domain, nivector, regs); + generic_handle_domain_irq(domain, 
nivector); } while (1); } diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c index 11dcc369ec14..c9d7c29d95e1 100644 --- a/arch/arm/mach-imx/mach-imx6q.c +++ b/arch/arm/mach-imx/mach-imx6q.c @@ -172,6 +172,9 @@ static void __init imx6q_init_machine(void) imx_get_soc_revision()); imx6q_enet_phy_init(); + + of_platform_default_populate(NULL, NULL, NULL); + imx_anatop_init(); cpu_is_imx6q() ? imx6q_pm_init() : imx6dl_pm_init(); imx6q_1588_init(); diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 9244437cb1b9..f2ecca339910 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -10,6 +10,7 @@ #include <linux/io.h> #include <linux/irq.h> #include <linux/genalloc.h> +#include <linux/irqchip/arm-gic.h> #include <linux/mfd/syscon.h> #include <linux/mfd/syscon/imx6q-iomuxc-gpr.h> #include <linux/of.h> @@ -619,6 +620,7 @@ static void __init imx6_pm_common_init(const struct imx6_pm_socdata static void imx6_pm_stby_poweroff(void) { + gic_cpu_if_down(0); imx6_set_lpm(STOP_POWER_OFF); imx6q_suspend_finish(0); diff --git a/arch/arm/mach-imx/src.c b/arch/arm/mach-imx/src.c index 95fd1fbb0826..59a8e8cc4469 100644 --- a/arch/arm/mach-imx/src.c +++ b/arch/arm/mach-imx/src.c @@ -9,6 +9,7 @@ #include <linux/iopoll.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/platform_device.h> #include <linux/reset-controller.h> #include <linux/smp.h> #include <asm/smp_plat.h> @@ -81,11 +82,6 @@ static const struct reset_control_ops imx_src_ops = { .reset = imx_src_reset_module, }; -static struct reset_controller_dev imx_reset_controller = { - .ops = &imx_src_ops, - .nr_resets = ARRAY_SIZE(sw_reset_bits), -}; - static void imx_gpcv2_set_m_core_pgc(bool enable, u32 offset) { writel_relaxed(enable, gpc_base + offset); @@ -177,10 +173,6 @@ void __init imx_src_init(void) src_base = of_iomap(np, 0); WARN_ON(!src_base); - imx_reset_controller.of_node = np; - if (IS_ENABLED(CONFIG_RESET_CONTROLLER)) - reset_controller_register(&imx_reset_controller); - /* * force warm reset sources to generate cold reset * for a more reliable restart @@ -214,3 +206,33 @@ void __init imx7_src_init(void) if (!gpc_base) return; } + +static const struct of_device_id imx_src_dt_ids[] = { + { .compatible = "fsl,imx51-src" }, + { /* sentinel */ } +}; + +static int imx_src_probe(struct platform_device *pdev) +{ + struct reset_controller_dev *rcdev; + + rcdev = devm_kzalloc(&pdev->dev, sizeof(*rcdev), GFP_KERNEL); + if (!rcdev) + return -ENOMEM; + + rcdev->ops = &imx_src_ops; + rcdev->dev = &pdev->dev; + rcdev->of_node = pdev->dev.of_node; + rcdev->nr_resets = ARRAY_SIZE(sw_reset_bits); + + return devm_reset_controller_register(&pdev->dev, rcdev); +} + +static struct platform_driver imx_src_driver = { + .driver = { + .name = "imx-src", + .of_match_table = imx_src_dt_ids, + }, + .probe = imx_src_probe, +}; +builtin_platform_driver(imx_src_driver); diff --git a/arch/arm/mach-imx/tzic.c b/arch/arm/mach-imx/tzic.c index 479a01bdac56..8b3d98d288d9 100644 --- a/arch/arm/mach-imx/tzic.c +++ b/arch/arm/mach-imx/tzic.c @@ -134,7 +134,7 @@ static void __exception_irq_entry tzic_handle_irq(struct pt_regs *regs) while (stat) { handled = 1; irqofs = fls(stat) - 1; - handle_domain_irq(domain, irqofs + i * 32, regs); + generic_handle_domain_irq(domain, irqofs + i * 32); stat &= ~(1 << irqofs); } } diff --git a/arch/arm/mach-omap1/include/mach/memory.h b/arch/arm/mach-omap1/include/mach/memory.h index 36bc0000cb6a..ba3a350479c8 100644 --- 
a/arch/arm/mach-omap1/include/mach/memory.h +++ b/arch/arm/mach-omap1/include/mach/memory.h @@ -9,16 +9,4 @@ /* REVISIT: omap1 legacy drivers still rely on this */ #include <mach/soc.h> -/* - * Bus address is physical address, except for OMAP-1510 Local Bus. - * OMAP-1510 bus address is translated into a Local Bus address if the - * OMAP bus type is lbus. We do the address translation based on the - * device overriding the defaults used in the dma-mapping API. - */ - -/* - * OMAP-1510 Local Bus address offset - */ -#define OMAP1510_LB_OFFSET UL(0x30000000) - #endif diff --git a/arch/arm/mach-omap1/irq.c b/arch/arm/mach-omap1/irq.c index b11edc8a46f0..ee6a93083154 100644 --- a/arch/arm/mach-omap1/irq.c +++ b/arch/arm/mach-omap1/irq.c @@ -165,7 +165,7 @@ asmlinkage void __exception_irq_entry omap1_handle_irq(struct pt_regs *regs) } irq: if (irqnr) - handle_domain_irq(domain, irqnr, regs); + generic_handle_domain_irq(domain, irqnr); else break; } while (irqnr); diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c index 86d3b3c157af..e60831c82b78 100644 --- a/arch/arm/mach-omap1/usb.c +++ b/arch/arm/mach-omap1/usb.c @@ -11,6 +11,7 @@ #include <linux/platform_device.h> #include <linux/dma-map-ops.h> #include <linux/io.h> +#include <linux/delay.h> #include <asm/irq.h> @@ -206,8 +207,6 @@ static inline void udc_device_init(struct omap_usb_config *pdata) #endif -#if IS_ENABLED(CONFIG_USB_OHCI_HCD) - /* The dmamask must be set for OHCI to work */ static u64 ohci_dmamask = ~(u32)0; @@ -236,20 +235,15 @@ static struct platform_device ohci_device = { static inline void ohci_device_init(struct omap_usb_config *pdata) { + if (!IS_ENABLED(CONFIG_USB_OHCI_HCD)) + return; + if (cpu_is_omap7xx()) ohci_resources[1].start = INT_7XX_USB_HHC_1; pdata->ohci_device = &ohci_device; pdata->ocpi_enable = &ocpi_enable; } -#else - -static inline void ohci_device_init(struct omap_usb_config *pdata) -{ -} - -#endif - #if defined(CONFIG_USB_OTG) && defined(CONFIG_ARCH_OMAP_OTG) static struct resource otg_resources[] = { @@ -534,33 +528,87 @@ bad: } #ifdef CONFIG_ARCH_OMAP15XX +/* OMAP-1510 OHCI has its own MMU for DMA */ +#define OMAP1510_LB_MEMSIZE 32 /* Should be same as SDRAM size */ +#define OMAP1510_LB_CLOCK_DIV 0xfffec10c +#define OMAP1510_LB_MMU_CTL 0xfffec208 +#define OMAP1510_LB_MMU_LCK 0xfffec224 +#define OMAP1510_LB_MMU_LD_TLB 0xfffec228 +#define OMAP1510_LB_MMU_CAM_H 0xfffec22c +#define OMAP1510_LB_MMU_CAM_L 0xfffec230 +#define OMAP1510_LB_MMU_RAM_H 0xfffec234 +#define OMAP1510_LB_MMU_RAM_L 0xfffec238 -/* ULPD_DPLL_CTRL */ -#define DPLL_IOB (1 << 13) -#define DPLL_PLL_ENABLE (1 << 4) -#define DPLL_LOCK (1 << 0) +/* + * Bus address is physical address, except for OMAP-1510 Local Bus. + * OMAP-1510 bus address is translated into a Local Bus address if the + * OMAP bus type is lbus. + */ +#define OMAP1510_LB_OFFSET UL(0x30000000) -/* ULPD_APLL_CTRL */ -#define APLL_NDPLL_SWITCH (1 << 0) +/* + * OMAP-1510 specific Local Bus clock on/off + */ +static int omap_1510_local_bus_power(int on) +{ + if (on) { + omap_writel((1 << 1) | (1 << 0), OMAP1510_LB_MMU_CTL); + udelay(200); + } else { + omap_writel(0, OMAP1510_LB_MMU_CTL); + } -static int omap_1510_usb_ohci_notifier(struct notifier_block *nb, - unsigned long event, void *data) + return 0; +} + +/* + * OMAP-1510 specific Local Bus initialization + * NOTE: This assumes 32MB memory size in OMAP1510LB_MEMSIZE. + * See also arch/mach-omap/memory.h for __virt_to_dma() and + * __dma_to_virt() which need to match with the physical + * Local Bus address below. 
+ */ +static int omap_1510_local_bus_init(void) { - struct device *dev = data; + unsigned int tlb; + unsigned long lbaddr, physaddr; + + omap_writel((omap_readl(OMAP1510_LB_CLOCK_DIV) & 0xfffffff8) | 0x4, + OMAP1510_LB_CLOCK_DIV); + + /* Configure the Local Bus MMU table */ + for (tlb = 0; tlb < OMAP1510_LB_MEMSIZE; tlb++) { + lbaddr = tlb * 0x00100000 + OMAP1510_LB_OFFSET; + physaddr = tlb * 0x00100000 + PHYS_OFFSET; + omap_writel((lbaddr & 0x0fffffff) >> 22, OMAP1510_LB_MMU_CAM_H); + omap_writel(((lbaddr & 0x003ffc00) >> 6) | 0xc, + OMAP1510_LB_MMU_CAM_L); + omap_writel(physaddr >> 16, OMAP1510_LB_MMU_RAM_H); + omap_writel((physaddr & 0x0000fc00) | 0x300, OMAP1510_LB_MMU_RAM_L); + omap_writel(tlb << 4, OMAP1510_LB_MMU_LCK); + omap_writel(0x1, OMAP1510_LB_MMU_LD_TLB); + } - if (event != BUS_NOTIFY_ADD_DEVICE) - return NOTIFY_DONE; + /* Enable the walking table */ + omap_writel(omap_readl(OMAP1510_LB_MMU_CTL) | (1 << 3), OMAP1510_LB_MMU_CTL); + udelay(200); - if (strncmp(dev_name(dev), "ohci", 4) == 0 && - dma_direct_set_offset(dev, PHYS_OFFSET, OMAP1510_LB_OFFSET, - (u64)-1)) - WARN_ONCE(1, "failed to set DMA offset\n"); - return NOTIFY_OK; + return 0; } -static struct notifier_block omap_1510_usb_ohci_nb = { - .notifier_call = omap_1510_usb_ohci_notifier, -}; +static void omap_1510_local_bus_reset(void) +{ + omap_1510_local_bus_power(1); + omap_1510_local_bus_init(); +} + +/* ULPD_DPLL_CTRL */ +#define DPLL_IOB (1 << 13) +#define DPLL_PLL_ENABLE (1 << 4) +#define DPLL_LOCK (1 << 0) + +/* ULPD_APLL_CTRL */ +#define APLL_NDPLL_SWITCH (1 << 0) static void __init omap_1510_usb_init(struct omap_usb_config *config) { @@ -616,19 +664,19 @@ static void __init omap_1510_usb_init(struct omap_usb_config *config) } #endif -#if IS_ENABLED(CONFIG_USB_OHCI_HCD) - if (config->register_host) { + if (IS_ENABLED(CONFIG_USB_OHCI_HCD) && config->register_host) { int status; - bus_register_notifier(&platform_bus_type, - &omap_1510_usb_ohci_nb); ohci_device.dev.platform_data = config; + dma_direct_set_offset(&ohci_device.dev, PHYS_OFFSET, + OMAP1510_LB_OFFSET, (u64)-1); status = platform_device_register(&ohci_device); if (status) pr_debug("can't register OHCI device, %d\n", status); /* hcd explicitly gates 48MHz */ + + config->lb_reset = omap_1510_local_bus_reset; } -#endif } #else diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 7f13adf26e61..02c253de9b6e 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -112,7 +112,6 @@ config ARCH_OMAP2PLUS select PM_GENERIC_DOMAINS select PM_GENERIC_DOMAINS_OF select RESET_CONTROLLER - select SIMPLE_PM_BUS select SOC_BUS select TI_SYSC select OMAP_IRQCHIP diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 12b26e04686f..0c2936c7a379 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -3614,6 +3614,8 @@ int omap_hwmod_init_module(struct device *dev, oh->flags |= HWMOD_SWSUP_SIDLE_ACT; if (data->cfg->quirks & SYSC_QUIRK_SWSUP_MSTANDBY) oh->flags |= HWMOD_SWSUP_MSTANDBY; + if (data->cfg->quirks & SYSC_QUIRK_CLKDM_NOAUTO) + oh->flags |= HWMOD_CLKDM_NOAUTO; error = omap_hwmod_check_module(dev, oh, data, sysc_fields, rev_offs, sysc_offs, syss_offs, diff --git a/arch/arm/mach-s3c/irq-s3c24xx.c b/arch/arm/mach-s3c/irq-s3c24xx.c index 3edc5f614eef..45dfd546e6fa 100644 --- a/arch/arm/mach-s3c/irq-s3c24xx.c +++ b/arch/arm/mach-s3c/irq-s3c24xx.c @@ -354,7 +354,7 @@ static inline int s3c24xx_handle_intc(struct s3c_irq_intc *intc, if (!(pnd & (1 << offset))) 
offset = __ffs(pnd); - handle_domain_irq(intc->domain, intc_offset + offset, regs); + generic_handle_domain_irq(intc->domain, intc_offset + offset); return true; } diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index e2c743aa2eb2..d9f7dfe2a7ed 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -340,6 +340,7 @@ ENTRY(\name\()_cache_fns) .macro define_tlb_functions name:req, flags_up:req, flags_smp .type \name\()_tlb_fns, #object + .align 2 ENTRY(\name\()_tlb_fns) .long \name\()_flush_user_tlb_range .long \name\()_flush_kern_tlb_range diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a951276f0547..a903b26cde40 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -36,6 +36,10 @@ * +-----+ * |RSVD | JIT scratchpad * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE) + * | ... | caller-saved registers + * +-----+ + * | ... | arguments passed on stack + * ARM_SP during call => +-----| * | | * | ... | Function call stack * | | @@ -63,6 +67,12 @@ * * When popping registers off the stack at the end of a BPF function, we * reference them via the current ARM_FP register. + * + * Some eBPF operations are implemented via a call to a helper function. + * Such calls are "invisible" in the eBPF code, so it is up to the calling + * program to preserve any caller-saved ARM registers during the call. The + * JIT emits code to push and pop those registers onto the stack, immediately + * above the callee stack frame. */ #define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \ 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \ @@ -70,6 +80,8 @@ #define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR) #define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC) +#define CALLER_MASK (1 << ARM_R0 | 1 << ARM_R1 | 1 << ARM_R2 | 1 << ARM_R3) + enum { /* Stack layout - these are offsets from (top of stack - 4) */ BPF_R2_HI, @@ -464,6 +476,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { + const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1); const s8 *tmp = bpf2a32[TMP_REG_1]; #if __LINUX_ARM_ARCH__ == 7 @@ -495,11 +508,17 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) emit(ARM_MOV_R(ARM_R0, rm), ctx); } + /* Push caller-saved registers on stack */ + emit(ARM_PUSH(CALLER_MASK & ~exclude_mask), ctx); + /* Call appropriate function */ emit_mov_i(ARM_IP, op == BPF_DIV ? 
(u32)jit_udiv32 : (u32)jit_mod32, ctx); emit_blx_r(ARM_IP, ctx); + /* Restore caller-saved registers from stack */ + emit(ARM_POP(CALLER_MASK & ~exclude_mask), ctx); + /* Save return value */ if (rd != ARM_R0) emit(ARM_MOV_R(rd, ARM_R0), ctx); diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 27e0af78e88b..9d8634e2f12f 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -439,7 +439,7 @@ static struct undef_hook kprobes_arm_break_hook = { #endif /* !CONFIG_THUMB2_KERNEL */ -int __init arch_init_kprobes() +int __init arch_init_kprobes(void) { arm_probes_decode_init(); #ifdef CONFIG_THUMB2_KERNEL diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index e842209e135d..543100151f2b 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -462,3 +462,4 @@ 446 common landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d13677f4731d..bc8d3702f4ed 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -133,7 +133,6 @@ config ARM64 select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_TIME_NS - select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_MOVE_PMD select HAVE_MOVE_PUD @@ -1940,8 +1939,6 @@ source "drivers/cpufreq/Kconfig" endmenu -source "drivers/firmware/Kconfig" - source "drivers/acpi/Kconfig" source "arch/arm64/kvm/Kconfig" diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index b0ce18d4cc98..96a81967c3bf 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -44,7 +44,6 @@ config ARCH_BCM2835 select ARM_AMBA select ARM_GIC select ARM_TIMER_SP804 - select BRCMSTB_L2_IRQ help This enables support for the Broadcom BCM2837 and BCM2711 SoC. These SoCs are used in the Raspberry Pi 3 and 4 devices. 
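[Editor's note on the dropped BRCMSTB_L2_IRQ / BCM7038_L1_IRQ / BCM7120_L2_IRQ / MESON_IRQ_GPIO selects in the surrounding arm and arm64 Kconfig hunks: the platform entries stop selecting these irqchip drivers, and the likely replacement is a platform default on the irqchip side, so nothing is lost for existing configs. A minimal sketch of what such an entry could look like; the exact prompt text, tristate/bool choice and dependency list here are assumptions, not a quote of drivers/irqchip/Kconfig:

	config BRCMSTB_L2_IRQ
		tristate "Broadcom STB L2 Interrupt Controller Driver"
		depends on ARCH_BCM2835 || ARCH_BRCMSTB || COMPILE_TEST
		default ARCH_BCM2835 || ARCH_BRCMSTB
		select GENERIC_IRQ_CHIP
		select IRQ_DOMAIN

With a "default ARCH_*" of this form the driver is still enabled whenever the platform is, which is why the per-platform "select" lines above and below become redundant and can be removed.]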
@@ -82,8 +81,6 @@ config ARCH_BITMAIN config ARCH_BRCMSTB bool "Broadcom Set-Top-Box SoCs" select ARCH_HAS_RESET_CONTROLLER - select BCM7038_L1_IRQ - select BRCMSTB_L2_IRQ select GENERIC_IRQ_CHIP select PINCTRL help @@ -167,7 +164,6 @@ config ARCH_MEDIATEK config ARCH_MESON bool "Amlogic Platforms" select COMMON_CLK - select MESON_IRQ_GPIO help This enables support for the arm64 based Amlogic SoCs such as the s905, S905X/D, S912, A113X/D or S905X/D2 diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts index 02f8e72f0cad..05486cccee1c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts @@ -75,7 +75,7 @@ pinctrl-0 = <&emac_rgmii_pins>; phy-supply = <®_gmac_3v3>; phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; status = "okay"; }; diff --git a/arch/arm64/boot/dts/arm/foundation-v8.dtsi b/arch/arm64/boot/dts/arm/foundation-v8.dtsi index 05ae893d1b2e..fbf13f7c2baf 100644 --- a/arch/arm64/boot/dts/arm/foundation-v8.dtsi +++ b/arch/arm64/boot/dts/arm/foundation-v8.dtsi @@ -115,7 +115,6 @@ bus@8000000 { compatible = "arm,vexpress,v2m-p1", "simple-bus"; - arm,v2m-memory-map = "rs1"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts b/arch/arm64/boot/dts/arm/fvp-base-revc.dts index b8a21092db4d..269b649934b5 100644 --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts @@ -192,32 +192,9 @@ remote-endpoint = <&clcd_pads>; }; }; - - panel-timing { - clock-frequency = <63500127>; - hactive = <1024>; - hback-porch = <152>; - hfront-porch = <48>; - hsync-len = <104>; - vactive = <768>; - vback-porch = <23>; - vfront-porch = <3>; - vsync-len = <4>; - }; }; bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - #interrupt-cells = <1>; interrupt-map-mask = <0 0 63>; interrupt-map = <0 0 0 &gic 0 0 GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi index 8e7a66943b01..6288e104a089 100644 --- a/arch/arm64/boot/dts/arm/juno-base.dtsi +++ b/arch/arm64/boot/dts/arm/juno-base.dtsi @@ -27,8 +27,6 @@ reg = <0x0 0x2b1f0000 0x0 0x1000>; interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "mhu_lpri_rx", - "mhu_hpri_rx"; #mbox-cells = <1>; clocks = <&soc_refclk100mhz>; clock-names = "apb_pclk"; @@ -804,16 +802,6 @@ }; bus@8000000 { - compatible = "simple-bus"; - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - #interrupt-cells = <1>; interrupt-map-mask = <0 0 15>; interrupt-map = <0 0 0 &gic 0 GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/arm/juno-motherboard.dtsi b/arch/arm64/boot/dts/arm/juno-motherboard.dtsi index 40d95c58b55e..fefd2b5f0176 100644 --- a/arch/arm64/boot/dts/arm/juno-motherboard.dtsi +++ b/arch/arm64/boot/dts/arm/juno-motherboard.dtsi @@ -92,16 +92,23 @@ }; bus@8000000 { - motherboard-bus { + compatible = "simple-bus"; + 
#address-cells = <2>; + #size-cells = <1>; + ranges = <0 0x8000000 0 0x8000000 0x18000000>; + + motherboard-bus@8000000 { compatible = "arm,vexpress,v2p-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; - model = "V2M-Juno"; + ranges = <0 0 0 0x08000000 0x04000000>, + <1 0 0 0x14000000 0x04000000>, + <2 0 0 0x18000000 0x04000000>, + <3 0 0 0x1c000000 0x04000000>, + <4 0 0 0x0c000000 0x04000000>, + <5 0 0 0x10000000 0x04000000>; arm,hbi = <0x252>; arm,vexpress,site = <0>; - arm,v2m-memory-map = "rs1"; flash@0 { /* 2 * 32MiB NOR Flash memory mounted on CS0 */ @@ -218,7 +225,7 @@ }; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <5>; @@ -246,7 +253,7 @@ clock-names = "KMIREFCLK", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x10000>; interrupts = <7>; diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts b/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts index 3050f45bade4..258991ad7cc0 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts +++ b/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts @@ -133,17 +133,6 @@ }; bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - #interrupt-cells = <1>; interrupt-map-mask = <0 0 63>; interrupt-map = <0 0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi index b917d9d3f1c4..33182d9e5826 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi +++ b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard-rs2.dtsi @@ -6,7 +6,7 @@ */ / { bus@8000000 { - motherboard-bus { + motherboard-bus@8000000 { arm,v2m-memory-map = "rs2"; iofpga-bus@300000000 { diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi index 4c4a381d2c75..5f6cab668aa0 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi +++ b/arch/arm64/boot/dts/arm/rtsm_ve-motherboard.dtsi @@ -77,13 +77,21 @@ }; bus@8000000 { - motherboard-bus { - arm,v2m-memory-map = "rs1"; + compatible = "simple-bus"; + #address-cells = <2>; + #size-cells = <1>; + ranges = <0 0x8000000 0 0x8000000 0x18000000>; + + motherboard-bus@8000000 { compatible = "arm,vexpress,v2m-p1", "simple-bus"; #address-cells = <2>; /* SMB chipselect number and offset */ #size-cells = <1>; - #interrupt-cells = <1>; - ranges; + ranges = <0 0 0 0x08000000 0x04000000>, + <1 0 0 0x14000000 0x04000000>, + <2 0 0 0x18000000 0x04000000>, + <3 0 0 0x1c000000 0x04000000>, + <4 0 0 0x0c000000 0x04000000>, + <5 0 0 0x10000000 0x04000000>; flash@0 { compatible = "arm,vexpress-flash", "cfi-flash"; @@ -130,7 +138,7 @@ clock-names = "apb_pclk"; }; - mmci@50000 { + mmc@50000 { compatible = "arm,pl180", "arm,primecell"; reg = <0x050000 0x1000>; interrupts = <9>, <10>; @@ -190,7 +198,7 @@ clock-names = "uartclk", "apb_pclk"; }; - wdt@f0000 { + watchdog@f0000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0f0000 0x1000>; interrupts = <0>; diff --git a/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts b/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts index d859914500a7..5b6d9d8e934d 100644 --- 
a/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts +++ b/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts @@ -145,61 +145,6 @@ }; smb: bus@8000000 { - compatible = "simple-bus"; - - #address-cells = <2>; - #size-cells = <1>; - ranges = <0 0 0 0x08000000 0x04000000>, - <1 0 0 0x14000000 0x04000000>, - <2 0 0 0x18000000 0x04000000>, - <3 0 0 0x1c000000 0x04000000>, - <4 0 0 0x0c000000 0x04000000>, - <5 0 0 0x10000000 0x04000000>; - - #interrupt-cells = <1>; - interrupt-map-mask = <0 0 63>; - interrupt-map = <0 0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, - <0 0 1 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, - <0 0 2 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, - <0 0 3 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, - <0 0 4 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, - <0 0 5 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, - <0 0 6 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, - <0 0 7 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, - <0 0 8 &gic GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, - <0 0 9 &gic GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, - <0 0 10 &gic GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>, - <0 0 11 &gic GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>, - <0 0 12 &gic GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>, - <0 0 13 &gic GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>, - <0 0 14 &gic GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>, - <0 0 15 &gic GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>, - <0 0 16 &gic GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>, - <0 0 17 &gic GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, - <0 0 18 &gic GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>, - <0 0 19 &gic GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>, - <0 0 20 &gic GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>, - <0 0 21 &gic GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>, - <0 0 22 &gic GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>, - <0 0 23 &gic GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>, - <0 0 24 &gic GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>, - <0 0 25 &gic GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>, - <0 0 26 &gic GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>, - <0 0 27 &gic GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, - <0 0 28 &gic GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>, - <0 0 29 &gic GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>, - <0 0 30 &gic GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>, - <0 0 31 &gic GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>, - <0 0 32 &gic GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, - <0 0 33 &gic GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, - <0 0 34 &gic GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, - <0 0 35 &gic GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, - <0 0 36 &gic GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>, - <0 0 37 &gic GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>, - <0 0 38 &gic GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>, - <0 0 39 &gic GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>, - <0 0 40 &gic GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, - <0 0 41 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, - <0 0 42 &gic GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + ranges = <0x8000000 0 0x8000000 0x18000000>; }; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 343ecf0e8973..06b36cc65865 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -405,9 +405,9 @@ interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>; clock-frequency = <0>; /* fixed up by bootloader */ clocks = <&clockgen QORIQ_CLK_HWACCEL 1>; - voltage-ranges = <1800 1800 3300 3300>; + voltage-ranges = <1800 1800>; sdhci,auto-cmd12; - broken-cd; + non-removable; little-endian; bus-width = <4>; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-beacon-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-beacon-som.dtsi index 988f8ab679ad..40f5e7a3b064 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-beacon-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-beacon-som.dtsi @@ -91,7 +91,7 @@ #size-cells = <1>; compatible = "jedec,spi-nor"; 
spi-max-frequency = <80000000>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; spi-rx-bus-width = <4>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-evk.dts b/arch/arm64/boot/dts/freescale/imx8mm-evk.dts index 4e2820d19244..a2b24d4d4e3e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-evk.dts @@ -48,7 +48,7 @@ #size-cells = <1>; compatible = "jedec,spi-nor"; spi-max-frequency = <80000000>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; spi-rx-bus-width = <4>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts index d17abb515835..e99e7644ff39 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts @@ -70,7 +70,9 @@ regulator-name = "rst-usb-eth2"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb_eth2>; - gpio = <&gpio3 2 GPIO_ACTIVE_LOW>; + gpio = <&gpio3 2 GPIO_ACTIVE_HIGH>; + enable-active-high; + regulator-always-on; }; reg_vdd_5v: regulator-5v { @@ -95,7 +97,7 @@ clocks = <&osc_can>; interrupt-parent = <&gpio4>; interrupts = <28 IRQ_TYPE_EDGE_FALLING>; - spi-max-frequency = <100000>; + spi-max-frequency = <10000000>; vdd-supply = <®_vdd_3v3>; xceiver-supply = <®_vdd_5v>; }; @@ -111,7 +113,7 @@ &fec1 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_enet>; - phy-connection-type = "rgmii"; + phy-connection-type = "rgmii-rxid"; phy-handle = <ðphy>; status = "okay"; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi index d0456daefda8..42bbbb3f532b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-som.dtsi @@ -91,10 +91,12 @@ reg_vdd_soc: BUCK1 { regulator-name = "buck1"; regulator-min-microvolt = <800000>; - regulator-max-microvolt = <900000>; + regulator-max-microvolt = <850000>; regulator-boot-on; regulator-always-on; regulator-ramp-delay = <3125>; + nxp,dvs-run-voltage = <850000>; + nxp,dvs-standby-voltage = <800000>; }; reg_vdd_arm: BUCK2 { @@ -102,6 +104,7 @@ regulator-min-microvolt = <850000>; regulator-max-microvolt = <950000>; regulator-boot-on; + regulator-always-on; regulator-ramp-delay = <3125>; nxp,dvs-run-voltage = <950000>; nxp,dvs-standby-voltage = <850000>; @@ -110,7 +113,7 @@ reg_vdd_dram: BUCK3 { regulator-name = "buck3"; regulator-min-microvolt = <850000>; - regulator-max-microvolt = <900000>; + regulator-max-microvolt = <950000>; regulator-boot-on; regulator-always-on; }; @@ -149,7 +152,7 @@ reg_vdd_snvs: LDO2 { regulator-name = "ldo2"; - regulator-min-microvolt = <850000>; + regulator-min-microvolt = <800000>; regulator-max-microvolt = <900000>; regulator-boot-on; regulator-always-on; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts index 05cb60991fb9..d52686f4c059 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7902.dts @@ -647,7 +647,7 @@ pinctrl_hog: hoggrp { fsl,pins = < MX8MM_IOMUXC_NAND_CE0_B_GPIO3_IO1 0x40000159 /* M2_GDIS# */ - MX8MM_IOMUXC_GPIO1_IO12_GPIO1_IO12 0x40000041 /* M2_RST# */ + MX8MM_IOMUXC_GPIO1_IO13_GPIO1_IO13 0x40000041 /* M2_RST# */ MX8MM_IOMUXC_NAND_DATA01_GPIO3_IO7 0x40000119 /* M2_OFF# */ MX8MM_IOMUXC_GPIO1_IO15_GPIO1_IO15 0x40000159 /* M2_WDIS# */ MX8MM_IOMUXC_SAI1_TXD2_GPIO4_IO14 0x40000041 /* AMP 
GPIO1 */ diff --git a/arch/arm64/boot/dts/freescale/imx8mn-beacon-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-beacon-som.dtsi index 54eaf3d6055b..3b2d627a0342 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-beacon-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn-beacon-som.dtsi @@ -101,7 +101,7 @@ #size-cells = <1>; compatible = "jedec,spi-nor"; spi-max-frequency = <80000000>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; spi-rx-bus-width = <4>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts index e77db4996e58..236f425e1570 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts +++ b/arch/arm64/boot/dts/freescale/imx8mn-venice-gw7902.dts @@ -633,7 +633,7 @@ pinctrl_hog: hoggrp { fsl,pins = < MX8MN_IOMUXC_NAND_CE0_B_GPIO3_IO1 0x40000159 /* M2_GDIS# */ - MX8MN_IOMUXC_GPIO1_IO12_GPIO1_IO12 0x40000041 /* M2_RST# */ + MX8MN_IOMUXC_GPIO1_IO13_GPIO1_IO13 0x40000041 /* M2_RST# */ MX8MN_IOMUXC_NAND_DATA01_GPIO3_IO7 0x40000119 /* M2_OFF# */ MX8MN_IOMUXC_GPIO1_IO15_GPIO1_IO15 0x40000159 /* M2_WDIS# */ MX8MN_IOMUXC_SAI2_RXFS_GPIO4_IO21 0x40000041 /* APP GPIO1 */ diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi index aa78e0d8c72b..fc178eebf8aa 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi @@ -74,7 +74,7 @@ compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <80000000>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; spi-rx-bus-width = <4>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-evk.dts b/arch/arm64/boot/dts/freescale/imx8mq-evk.dts index 49f9db971f3b..b83df77195ec 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx8mq-evk.dts @@ -337,6 +337,8 @@ #size-cells = <1>; compatible = "micron,n25q256a", "jedec,spi-nor"; spi-max-frequency = <29000000>; + spi-tx-bus-width = <1>; + spi-rx-bus-width = <4>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-kontron-pitx-imx8m.dts b/arch/arm64/boot/dts/freescale/imx8mq-kontron-pitx-imx8m.dts index f593e4ff62e1..564746d5000d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-kontron-pitx-imx8m.dts +++ b/arch/arm64/boot/dts/freescale/imx8mq-kontron-pitx-imx8m.dts @@ -281,7 +281,7 @@ #address-cells = <1>; #size-cells = <1>; reg = <0>; - spi-tx-bus-width = <4>; + spi-tx-bus-width = <1>; spi-rx-bus-width = <4>; m25p,fast-read; spi-max-frequency = <50000000>; diff --git a/arch/arm64/boot/dts/qcom/pm8150.dtsi b/arch/arm64/boot/dts/qcom/pm8150.dtsi index c566a64b1373..0df76f7b1cc1 100644 --- a/arch/arm64/boot/dts/qcom/pm8150.dtsi +++ b/arch/arm64/boot/dts/qcom/pm8150.dtsi @@ -48,8 +48,10 @@ #size-cells = <0>; pon: power-on@800 { - compatible = "qcom,pm8916-pon"; + compatible = "qcom,pm8998-pon"; reg = <0x0800>; + mode-bootloader = <0x2>; + mode-recovery = <0x1>; pon_pwrkey: pwrkey { compatible = "qcom,pm8941-pwrkey"; diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts index 8ac96f8e79d4..28d5b5528516 100644 --- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts +++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts @@ -804,6 +804,16 @@ }; }; +&pon_pwrkey { + status = "okay"; +}; + +&pon_resin { + status = "okay"; + + linux,code = <KEY_VOLUMEDOWN>; +}; + &qupv3_id_0 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index 
0f2b3c00e434..70c88c37de32 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -273,7 +273,6 @@ "Headphone Jack", "HPOL", "Headphone Jack", "HPOR"; - #sound-dai-cells = <0>; #address-cells = <1>; #size-cells = <0>; @@ -301,11 +300,11 @@ }; }; - dai-link@2 { + dai-link@5 { link-name = "MultiMedia2"; - reg = <2>; + reg = <LPASS_DP_RX>; cpu { - sound-dai = <&lpass_cpu 2>; + sound-dai = <&lpass_cpu LPASS_DP_RX>; }; codec { @@ -782,7 +781,7 @@ hp_i2c: &i2c9 { qcom,playback-sd-lines = <0>; }; - hdmi-primary@0 { + hdmi@5 { reg = <LPASS_DP_RX>; }; }; diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi index 53a21d086178..fd78f16181dd 100644 --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi @@ -1850,9 +1850,9 @@ cpufreq_hw: cpufreq@18591000 { compatible = "qcom,cpufreq-epss"; - reg = <0 0x18591100 0 0x900>, - <0 0x18592100 0 0x900>, - <0 0x18593100 0 0x900>; + reg = <0 0x18591000 0 0x1000>, + <0 0x18592000 0 0x1000>, + <0 0x18593000 0 0x1000>; clocks = <&rpmhcc RPMH_CXO_CLK>, <&gcc GCC_GPLL0>; clock-names = "xo", "alternate"; #freq-domain-cells = <1>; diff --git a/arch/arm64/boot/dts/qcom/sdm630.dtsi b/arch/arm64/boot/dts/qcom/sdm630.dtsi index 9153e6616ba4..9c7f87e42fcc 100644 --- a/arch/arm64/boot/dts/qcom/sdm630.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm630.dtsi @@ -654,9 +654,20 @@ compatible = "qcom,sdm660-a2noc"; reg = <0x01704000 0xc100>; #interconnect-cells = <1>; - clock-names = "bus", "bus_a"; + clock-names = "bus", + "bus_a", + "ipa", + "ufs_axi", + "aggre2_ufs_axi", + "aggre2_usb3_axi", + "cfg_noc_usb2_axi"; clocks = <&rpmcc RPM_SMD_AGGR2_NOC_CLK>, - <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>; + <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>, + <&rpmcc RPM_SMD_IPA_CLK>, + <&gcc GCC_UFS_AXI_CLK>, + <&gcc GCC_AGGRE2_UFS_AXI_CLK>, + <&gcc GCC_AGGRE2_USB3_AXI_CLK>, + <&gcc GCC_CFG_NOC_USB2_AXI_CLK>; }; mnoc: interconnect@1745000 { diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi index 6d7172e6f4c3..b3b911926184 100644 --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi @@ -128,23 +128,28 @@ no-map; }; - wlan_msa_mem: memory@8c400000 { - reg = <0 0x8c400000 0 0x100000>; + ipa_fw_mem: memory@8c400000 { + reg = <0 0x8c400000 0 0x10000>; no-map; }; - gpu_mem: memory@8c515000 { - reg = <0 0x8c515000 0 0x2000>; + ipa_gsi_mem: memory@8c410000 { + reg = <0 0x8c410000 0 0x5000>; no-map; }; - ipa_fw_mem: memory@8c517000 { - reg = <0 0x8c517000 0 0x5a000>; + gpu_mem: memory@8c415000 { + reg = <0 0x8c415000 0 0x2000>; no-map; }; - adsp_mem: memory@8c600000 { - reg = <0 0x8c600000 0 0x1a00000>; + adsp_mem: memory@8c500000 { + reg = <0 0x8c500000 0 0x1a00000>; + no-map; + }; + + wlan_msa_mem: memory@8df00000 { + reg = <0 0x8df00000 0 0x100000>; no-map; }; diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 385e5029437d..2ba23aa582a1 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -16,6 +16,17 @@ #include "sdm850.dtsi" #include "pm8998.dtsi" +/* + * Update following upstream (sdm845.dtsi) reserved + * memory mappings for firmware loading to succeed + * and enable the IPA device. 
+ */ +/delete-node/ &ipa_fw_mem; +/delete-node/ &ipa_gsi_mem; +/delete-node/ &gpu_mem; +/delete-node/ &adsp_mem; +/delete-node/ &wlan_msa_mem; + / { model = "Lenovo Yoga C630"; compatible = "lenovo,yoga-c630", "qcom,sdm845"; @@ -58,6 +69,29 @@ }; }; + /* Reserved memory changes for IPA */ + reserved-memory { + wlan_msa_mem: memory@8c400000 { + reg = <0 0x8c400000 0 0x100000>; + no-map; + }; + + gpu_mem: memory@8c515000 { + reg = <0 0x8c515000 0 0x2000>; + no-map; + }; + + ipa_fw_mem: memory@8c517000 { + reg = <0 0x8c517000 0 0x5a000>; + no-map; + }; + + adsp_mem: memory@8c600000 { + reg = <0 0x8c600000 0 0x1a00000>; + no-map; + }; + }; + sn65dsi86_refclk: sn65dsi86-refclk { compatible = "fixed-clock"; #clock-cells = <0>; diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi index 8c15d9fed08f..d12e4cbfc852 100644 --- a/arch/arm64/boot/dts/qcom/sm8250.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi @@ -2590,9 +2590,10 @@ power-domains = <&dispcc MDSS_GDSC>; clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>, + <&gcc GCC_DISP_HF_AXI_CLK>, <&gcc GCC_DISP_SF_AXI_CLK>, <&dispcc DISP_CC_MDSS_MDP_CLK>; - clock-names = "iface", "nrt_bus", "core"; + clock-names = "iface", "bus", "nrt_bus", "core"; assigned-clocks = <&dispcc DISP_CC_MDSS_MDP_CLK>; assigned-clock-rates = <460000000>; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 156d96afbbfc..545197bc0501 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -245,7 +245,6 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_FW_LOADER_USER_HELPER=y CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y CONFIG_HISILICON_LPC=y -CONFIG_SIMPLE_PM_BUS=y CONFIG_FSL_MC_BUS=y CONFIG_TEGRA_ACONNECT=m CONFIG_GNSS=m diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index 88d20f04c64a..519ac1f7f859 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -32,7 +32,7 @@ ({ \ const struct arch_timer_erratum_workaround *__wa; \ __wa = __this_cpu_read(timer_unstable_counter_workaround); \ - (__wa && __wa->h) ? __wa->h : arch_timer_##h; \ + (__wa && __wa->h) ? ({ isb(); __wa->h;}) : arch_timer_##h; \ }) #else @@ -52,8 +52,6 @@ struct arch_timer_erratum_workaround { enum arch_timer_erratum_match_type match_type; const void *id; const char *desc; - u32 (*read_cntp_tval_el0)(void); - u32 (*read_cntv_tval_el0)(void); u64 (*read_cntpct_el0)(void); u64 (*read_cntvct_el0)(void); int (*set_next_event_phys)(unsigned long, struct clock_event_device *); @@ -64,24 +62,15 @@ struct arch_timer_erratum_workaround { DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround); -/* inline sysreg accessors that make erratum_handler() work */ -static inline notrace u32 arch_timer_read_cntp_tval_el0(void) -{ - return read_sysreg(cntp_tval_el0); -} - -static inline notrace u32 arch_timer_read_cntv_tval_el0(void) -{ - return read_sysreg(cntv_tval_el0); -} - static inline notrace u64 arch_timer_read_cntpct_el0(void) { + isb(); return read_sysreg(cntpct_el0); } static inline notrace u64 arch_timer_read_cntvct_el0(void) { + isb(); return read_sysreg(cntvct_el0); } @@ -102,51 +91,58 @@ static inline notrace u64 arch_timer_read_cntvct_el0(void) * the code. 
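The erratum_handler() change above keeps the pattern of a per-CPU workaround overriding a default accessor, now with an extra isb() on the workaround path. The user-space model below shows only the selection pattern (use the override if present, fall back otherwise); every name in it is made up and none of it is kernel API.

#include <stdint.h>
#include <stdio.h>

struct counter_workaround {
        uint64_t (*read_counter)(void);         /* NULL means "no override" */
};

static uint64_t default_read_counter(void)
{
        static uint64_t fake = 100;

        return fake += 10;                      /* stands in for reading the counter */
}

static uint64_t quirky_read_counter(void)
{
        /* a real workaround would add barriers or re-read here */
        return default_read_counter();
}

/* per-CPU in the kernel; a single global is enough for the model */
static const struct counter_workaround *this_cpu_wa;

static uint64_t read_counter(void)
{
        const struct counter_workaround *wa = this_cpu_wa;

        return (wa && wa->read_counter) ? wa->read_counter() : default_read_counter();
}

int main(void)
{
        static const struct counter_workaround wa = { .read_counter = quirky_read_counter };

        printf("default path:    %llu\n", (unsigned long long)read_counter());
        this_cpu_wa = &wa;
        printf("workaround path: %llu\n", (unsigned long long)read_counter());
        return 0;
}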
*/ static __always_inline -void arch_timer_reg_write_cp15(int access, enum arch_timer_reg reg, u32 val) +void arch_timer_reg_write_cp15(int access, enum arch_timer_reg reg, u64 val) { if (access == ARCH_TIMER_PHYS_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: write_sysreg(val, cntp_ctl_el0); + isb(); break; - case ARCH_TIMER_REG_TVAL: - write_sysreg(val, cntp_tval_el0); + case ARCH_TIMER_REG_CVAL: + write_sysreg(val, cntp_cval_el0); break; + default: + BUILD_BUG(); } } else if (access == ARCH_TIMER_VIRT_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: write_sysreg(val, cntv_ctl_el0); + isb(); break; - case ARCH_TIMER_REG_TVAL: - write_sysreg(val, cntv_tval_el0); + case ARCH_TIMER_REG_CVAL: + write_sysreg(val, cntv_cval_el0); break; + default: + BUILD_BUG(); } + } else { + BUILD_BUG(); } - - isb(); } static __always_inline -u32 arch_timer_reg_read_cp15(int access, enum arch_timer_reg reg) +u64 arch_timer_reg_read_cp15(int access, enum arch_timer_reg reg) { if (access == ARCH_TIMER_PHYS_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: return read_sysreg(cntp_ctl_el0); - case ARCH_TIMER_REG_TVAL: - return arch_timer_reg_read_stable(cntp_tval_el0); + default: + BUILD_BUG(); } } else if (access == ARCH_TIMER_VIRT_ACCESS) { switch (reg) { case ARCH_TIMER_REG_CTRL: return read_sysreg(cntv_ctl_el0); - case ARCH_TIMER_REG_TVAL: - return arch_timer_reg_read_stable(cntv_tval_el0); + default: + BUILD_BUG(); } } - BUG(); + BUILD_BUG(); + unreachable(); } static inline u32 arch_timer_get_cntfrq(void) @@ -169,7 +165,6 @@ static __always_inline u64 __arch_counter_get_cntpct_stable(void) { u64 cnt; - isb(); cnt = arch_timer_reg_read_stable(cntpct_el0); arch_counter_enforce_ordering(cnt); return cnt; @@ -189,7 +184,6 @@ static __always_inline u64 __arch_counter_get_cntvct_stable(void) { u64 cnt; - isb(); cnt = arch_timer_reg_read_stable(cntvct_el0); arch_counter_enforce_ordering(cnt); return cnt; diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 3cb206aea3db..6bdb5f5db438 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 449 +#define __NR_compat_syscalls 450 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 844f6ae58662..41ea1195e44b 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -903,6 +903,8 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #define __NR_process_mrelease 448 __SYSCALL(__NR_process_mrelease, sys_process_mrelease) +#define __NR_futex_waitv 449 +__SYSCALL(__NR_futex_waitv, sys_futex_waitv) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 32f9796c4ffe..f7408edf8571 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -17,6 +17,7 @@ #include <asm/daifflags.h> #include <asm/esr.h> #include <asm/exception.h> +#include <asm/irq_regs.h> #include <asm/kprobes.h> #include <asm/mmu.h> #include <asm/processor.h> @@ -219,22 +220,6 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) lockdep_hardirqs_on(CALLER_ADDR0); } -static void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) 
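The accessors above move from the 32-bit relative TVAL register to the 64-bit absolute CVAL compare value. A stand-alone sketch of the conversion, which is plain arithmetic: the compare value is the current count plus the relative timeout.

#include <stdint.h>
#include <stdio.h>

/*
 * TVAL is a signed 32-bit count-down relative to "now"; CVAL is the absolute
 * 64-bit counter value at which the timer fires.  Converting a relative
 * timeout into a compare value is one addition.
 */
static uint64_t tval_to_cval(uint64_t now, int32_t tval)
{
        return now + (int64_t)tval;
}

int main(void)
{
        uint64_t now = 0xffffff00ull;           /* illustrative counter reading */
        int32_t timeout_ticks = 0x200;

        printf("cval = 0x%llx\n", (unsigned long long)tval_to_cval(now, timeout_ticks));
        return 0;
}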
&& !interrupts_enabled(regs)) - arm64_enter_nmi(regs); - else - enter_from_kernel_mode(regs); -} - -static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) - arm64_exit_nmi(regs); - else - exit_to_kernel_mode(regs); -} - static void __sched arm64_preempt_schedule_irq(void) { lockdep_assert_irqs_disabled(); @@ -263,10 +248,14 @@ static void __sched arm64_preempt_schedule_irq(void) static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { + struct pt_regs *old_regs = set_irq_regs(regs); + if (on_thread_stack()) call_on_irq_stack(regs, handler); else handler(regs); + + set_irq_regs(old_regs); } extern void (*handle_arch_irq)(struct pt_regs *); @@ -432,13 +421,22 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) } } -static void noinstr el1_interrupt(struct pt_regs *regs, - void (*handler)(struct pt_regs *)) +static __always_inline void __el1_pnmi(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) { - write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + arm64_enter_nmi(regs); + do_interrupt_handler(regs, handler); + arm64_exit_nmi(regs); +} + +static __always_inline void __el1_irq(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + enter_from_kernel_mode(regs); - enter_el1_irq_or_nmi(regs); + irq_enter_rcu(); do_interrupt_handler(regs, handler); + irq_exit_rcu(); /* * Note: thread_info::preempt_count includes both thread_info::count @@ -449,7 +447,17 @@ static void noinstr el1_interrupt(struct pt_regs *regs, READ_ONCE(current_thread_info()->preempt_count) == 0) arm64_preempt_schedule_irq(); - exit_el1_irq_or_nmi(regs); + exit_to_kernel_mode(regs); +} +static void noinstr el1_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + __el1_pnmi(regs, handler); + else + __el1_irq(regs, handler); } asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs) @@ -667,7 +675,9 @@ static void noinstr el0_interrupt(struct pt_regs *regs, if (regs->pc & BIT(55)) arm64_apply_bp_hardening(); + irq_enter_rcu(); do_interrupt_handler(regs, handler); + irq_exit_rcu(); exit_to_user_mode(regs); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index fb0f523d1492..0a048dc06a7d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -24,6 +24,7 @@ struct hyp_pool { /* Allocation */ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index bacd493a4eac..34eeb524b686 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -35,7 +35,18 @@ const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) { - return hyp_alloc_pages(&host_s2_pool, get_order(size)); + void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); + + hyp_split_page(hyp_virt_to_page(addr)); + + /* + * The size of concatenated PGDs is always a power of two of PAGE_SIZE, + * so there should be no need to free any of the tail pages to make the + * allocation exact. 
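do_interrupt_handler() above now saves and restores the per-CPU irq regs pointer around the handler so nested entries unwind correctly. A small user-space model of that save/restore discipline; the types and function names are stand-ins, not the kernel's.

#include <stdio.h>

struct regs { int dummy; };

/* stands in for the per-CPU __irq_regs pointer */
static struct regs *current_irq_regs;

static struct regs *set_irq_regs_model(struct regs *new_regs)
{
        struct regs *old = current_irq_regs;

        current_irq_regs = new_regs;
        return old;
}

static void handle(struct regs *regs, int depth)
{
        struct regs *old = set_irq_regs_model(regs);

        printf("depth %d: regs=%p saved=%p\n", depth, (void *)regs, (void *)old);
        if (depth < 2) {
                struct regs nested = { 0 };

                handle(&nested, depth + 1);     /* a nested interrupt */
        }
        set_irq_regs_model(old);                /* restore on the way out */
}

int main(void)
{
        struct regs r = { 0 };

        handle(&r, 1);
        printf("restored to %p (expect NULL)\n", (void *)current_irq_regs);
        return 0;
}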
+ */ + WARN_ON(size != (PAGE_SIZE << get_order(size))); + + return addr; } static void *host_s2_zalloc_page(void *pool) diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 41fc25bdfb34..0bd7701ad1df 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -152,6 +152,7 @@ static inline void hyp_page_ref_inc(struct hyp_page *p) static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) { + BUG_ON(!p->refcount); p->refcount--; return (p->refcount == 0); } @@ -193,6 +194,20 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) hyp_spin_unlock(&pool->lock); } +void hyp_split_page(struct hyp_page *p) +{ + unsigned short order = p->order; + unsigned int i; + + p->order = 0; + for (i = 1; i < (1 << order); i++) { + struct hyp_page *tail = p + i; + + tail->order = 0; + hyp_set_page_refcounted(tail); + } +} + void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) { unsigned short i = order; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 1a94a7ca48f2..69bd1732a299 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1529,8 +1529,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * when updating the PG_mte_tagged page flag, see * sanitise_mte_tags for more details. */ - if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) - return -EINVAL; + if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) { + ret = -EINVAL; + break; + } if (vma->vm_flags & VM_PFNMAP) { /* IO region dirty page logging not allowed */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 23505fc35324..a8158c948966 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -43,7 +43,7 @@ void __init arm64_hugetlb_cma_reserve(void) #ifdef CONFIG_ARM64_4K_PAGES order = PUD_SHIFT - PAGE_SHIFT; #else - order = CONT_PMD_SHIFT + PMD_SHIFT - PAGE_SHIFT; + order = CONT_PMD_SHIFT - PAGE_SHIFT; #endif /* * HugeTLB CMA reservation is required for gigantic diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 41c23f474ea6..803e7773fa86 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1136,6 +1136,11 @@ out: return prog; } +u64 bpf_jit_alloc_exec_limit(void) +{ + return BPF_JIT_REGION_SIZE; +} + void *bpf_jit_alloc_exec(unsigned long size) { return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 9d4d898df76b..aed2b3e734ee 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -8,7 +8,7 @@ config CSKY select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS - select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 + select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace) select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select COMMON_CLK select CLKSRC_MMIO @@ -17,7 +17,6 @@ config CSKY select CSKY_APB_INTC select DMA_DIRECT_REMAP select IRQ_DOMAIN - select HANDLE_DOMAIN_IRQ select DW_APB_TIMER_OF select GENERIC_IOREMAP select GENERIC_LIB_ASHLDI3 @@ -241,6 +240,7 @@ endchoice menuconfig HAVE_TCM bool "Tightly-Coupled/Sram Memory" + depends on !COMPILE_TEST help The implementation are not only used by TCM (Tightly-Coupled Meory) but also used by sram on SOC bus. 
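The hugetlbpage.c change above matters because arm64's CONT_PMD_SHIFT is already an absolute shift that includes PMD_SHIFT, so adding PMD_SHIFT again inflated the gigantic-page order. A quick arithmetic check using illustrative 64K-granule values; treat the three constants as assumptions, not a statement about every configuration.

#include <stdio.h>

int main(void)
{
        /* illustrative arm64 64K-granule values, not universal */
        const unsigned int page_shift = 16;
        const unsigned int pmd_shift = 29;
        const unsigned int cont_pmd_shift = 34; /* pmd_shift + 5: 32 contiguous PMDs */

        unsigned int old_order = cont_pmd_shift + pmd_shift - page_shift; /* double-counts pmd_shift */
        unsigned int new_order = cont_pmd_shift - page_shift;

        printf("old order %u -> a 2^%u byte \"huge page\" (absurd)\n",
               old_order, old_order + page_shift);
        printf("new order %u -> %llu GiB gigantic page\n",
               new_order, (1ull << (new_order + page_shift)) >> 30);
        return 0;
}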
It follow existed linux tcm diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 91818787d860..02b72a000767 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -74,7 +74,6 @@ static __always_inline unsigned long __fls(unsigned long x) * bug fix, why only could use atomic!!!! */ #include <asm-generic/bitops/non-atomic.h> -#define __clear_bit(nr, vaddr) clear_bit(nr, vaddr) #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic.h> diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index 00e3c8ebf9b8..a4ababf25e24 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -249,7 +249,7 @@ ENTRY(csky_irq) mov a0, sp - jbsr csky_do_IRQ + jbsr generic_handle_arch_irq jmpi ret_from_exception diff --git a/arch/csky/kernel/irq.c b/arch/csky/kernel/irq.c index 03a1930f1cbb..fcdaf3156286 100644 --- a/arch/csky/kernel/irq.c +++ b/arch/csky/kernel/irq.c @@ -15,8 +15,3 @@ void __init init_IRQ(void) setup_smp_ipi(); #endif } - -asmlinkage void __irq_entry csky_do_IRQ(struct pt_regs *regs) -{ - handle_arch_irq(regs); -} diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 0105ac81b432..1a5f54e0d272 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -99,7 +99,8 @@ static int gpr_set(struct task_struct *target, if (ret) return ret; - regs.sr = task_pt_regs(target)->sr; + /* BIT(0) of regs.sr is Condition Code/Carry bit */ + regs.sr = (regs.sr & BIT(0)) | (task_pt_regs(target)->sr & ~BIT(0)); #ifdef CONFIG_CPU_HAS_HILO regs.dcsr = task_pt_regs(target)->dcsr; #endif diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index bc4238b9f709..c7b763d2f526 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -52,10 +52,14 @@ static long restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { int err = 0; + unsigned long sr = regs->sr; /* sc_pt_regs is structured the same as the start of pt_regs */ err |= __copy_from_user(regs, &sc->sc_pt_regs, sizeof(struct pt_regs)); + /* BIT(0) of regs->sr is Condition Code/Carry bit */ + regs->sr = (sr & ~1) | (regs->sr & 1); + /* Restore the floating-point state. */ err |= restore_fpu_state(sc); diff --git a/arch/h8300/include/asm/irq.h b/arch/h8300/include/asm/irq.h index 5fc5b436dde9..776cf06d7a59 100644 --- a/arch/h8300/include/asm/irq.h +++ b/arch/h8300/include/asm/irq.h @@ -2,8 +2,6 @@ #ifndef _H8300_IRQ_H_ #define _H8300_IRQ_H_ -#include <linux/irqchip.h> - #if defined(CONFIG_CPU_H8300H) #define NR_IRQS 64 #define IRQ_CHIP h8300h_irq_chip diff --git a/arch/h8300/kernel/irq.c b/arch/h8300/kernel/irq.c index 834e4d7b1bcf..8ad6d702cd0b 100644 --- a/arch/h8300/kernel/irq.c +++ b/arch/h8300/kernel/irq.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/irqchip.h> #include <linux/irqdomain.h> #include <linux/of_irq.h> #include <asm/traps.h> diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 045792cde481..1e33666fa679 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -388,8 +388,6 @@ config CRASH_DUMP help Generate crash dump after being started by kexec. 
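The csky ptrace and sigreturn fixes above are the same mask-and-merge: privileged bits of the saved status register come from the kernel copy, and user state may only influence bit 0, the condition code/carry bit. A generic, runnable illustration of the merge; the values are made up.

#include <stdint.h>
#include <stdio.h>

/* keep only user_mask bits from the user-supplied value */
static uint32_t merge_status(uint32_t kernel_sr, uint32_t user_sr, uint32_t user_mask)
{
        return (kernel_sr & ~user_mask) | (user_sr & user_mask);
}

int main(void)
{
        uint32_t kernel_sr = 0x80000140;        /* invented privileged bits */
        uint32_t user_sr = 0x00000001;          /* user flips only the C bit */

        /* bit 0 is the condition code/carry bit in the hunks above */
        printf("merged sr = 0x%08x\n",
               (unsigned int)merge_status(kernel_sr, user_sr, 1u << 0));
        return 0;
}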
-source "drivers/firmware/Kconfig" - endmenu menu "Power management and ACPI options" diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 864775970c50..0e5c1ad3239c 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -124,18 +124,13 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) __ticket_spin_unlock(lock); } -static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, - unsigned long flags) -{ - arch_spin_lock(lock); -} -#define arch_spin_lock_flags arch_spin_lock_flags - #ifdef ASM_SUPPORTED static __always_inline void -arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags) +arch_read_lock(arch_rwlock_t *lock) { + unsigned long flags = 0; + __asm__ __volatile__ ( "tbit.nz p6, p0 = %1,%2\n" "br.few 3f\n" @@ -157,13 +152,8 @@ arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags) : "p6", "p7", "r2", "memory"); } -#define arch_read_lock_flags arch_read_lock_flags -#define arch_read_lock(lock) arch_read_lock_flags(lock, 0) - #else /* !ASM_SUPPORTED */ -#define arch_read_lock_flags(rw, flags) arch_read_lock(rw) - #define arch_read_lock(rw) \ do { \ arch_rwlock_t *__read_lock_ptr = (rw); \ @@ -186,8 +176,10 @@ do { \ #ifdef ASM_SUPPORTED static __always_inline void -arch_write_lock_flags(arch_rwlock_t *lock, unsigned long flags) +arch_write_lock(arch_rwlock_t *lock) { + unsigned long flags = 0; + __asm__ __volatile__ ( "tbit.nz p6, p0 = %1, %2\n" "mov ar.ccv = r0\n" @@ -210,9 +202,6 @@ arch_write_lock_flags(arch_rwlock_t *lock, unsigned long flags) : "ar.ccv", "p6", "p7", "r2", "r29", "memory"); } -#define arch_write_lock_flags arch_write_lock_flags -#define arch_write_lock(rw) arch_write_lock_flags(rw, 0) - #define arch_write_trylock(rw) \ ({ \ register long result; \ diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 9a8394e96388..9c57b245dc12 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -58,7 +58,7 @@ struct nfhd_device { struct gendisk *disk; }; -static blk_qc_t nfhd_submit_bio(struct bio *bio) +static void nfhd_submit_bio(struct bio *bio) { struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; @@ -76,7 +76,6 @@ static blk_qc_t nfhd_submit_bio(struct bio *bio) sec += len; } bio_endio(bio); - return BLK_QC_T_NONE; } static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -100,6 +99,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) { struct nfhd_device *dev; int dev_id = id - NFHD_DEV_OFFSET; + int err = -ENOMEM; pr_info("nfhd%u: found device with %u blocks (%u bytes)\n", dev_id, blocks, bsize); @@ -130,16 +130,20 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) sprintf(dev->disk->disk_name, "nfhd%u", dev_id); set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); blk_queue_logical_block_size(dev->disk->queue, bsize); - add_disk(dev->disk); + err = add_disk(dev->disk); + if (err) + goto out_cleanup_disk; list_add_tail(&dev->list, &nfhd_list); return 0; +out_cleanup_disk: + blk_cleanup_disk(dev->disk); free_dev: kfree(dev); out: - return -ENOMEM; + return err; } static int __init nfhd_init(void) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1ac55e7b47f0..8ab46625ddd3 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -250,6 +250,7 @@ static inline void __flush_page_to_ram(void *vaddr) #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define 
flush_dcache_page(page) __flush_page_to_ram(page_address(page)) +void flush_dcache_folio(struct folio *folio); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 771ca53af06d..95fc35669860 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -47,7 +47,6 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GUP_GET_PTE_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT - select HANDLE_DOMAIN_IRQ select HAVE_ARCH_COMPILER_H select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB if MIPS_FP_SUPPORT @@ -1782,6 +1781,7 @@ config CPU_BMIPS select CPU_HAS_PREFETCH select CPU_SUPPORTS_CPUFREQ select MIPS_EXTERNAL_TIMER + select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU help Support for BMIPS32/3300/4350/4380 and BMIPS5000 processors. @@ -3316,8 +3316,6 @@ source "drivers/cpuidle/Kconfig" endmenu -source "drivers/firmware/Kconfig" - source "arch/mips/kvm/Kconfig" source "arch/mips/vdso/Kconfig" diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index be5d4afcd30f..844f882096e6 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -2609,7 +2609,10 @@ static void octeon_irq_ciu3_ip2(void) else hw = intsn; - ret = handle_domain_irq(domain, hw, NULL); + irq_enter(); + ret = generic_handle_domain_irq(domain, hw); + irq_exit(); + if (ret < 0) { union cvmx_ciu3_iscx_w1c isc_w1c; u64 isc_w1c_addr = ciu3_addr + CIU3_ISC_W1C(intsn); diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index b3dc9c589442..f207388541d5 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -61,6 +61,8 @@ static inline void flush_dcache_page(struct page *page) SetPageDcacheDirty(page); } +void flush_dcache_folio(struct folio *folio); + #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/mips/include/asm/mips-cps.h b/arch/mips/include/asm/mips-cps.h index 35fb8ee6dd33..fd43d876892e 100644 --- a/arch/mips/include/asm/mips-cps.h +++ b/arch/mips/include/asm/mips-cps.h @@ -10,8 +10,6 @@ #include <linux/io.h> #include <linux/types.h> -#include <asm/mips-boards/launch.h> - extern unsigned long __cps_access_bad_size(void) __compiletime_error("Bad size for CPS accessor"); @@ -167,30 +165,11 @@ static inline uint64_t mips_cps_cluster_config(unsigned int cluster) */ static inline unsigned int mips_cps_numcores(unsigned int cluster) { - unsigned int ncores; - if (!mips_cm_present()) return 0; /* Add one before masking to handle 0xff indicating no cores */ - ncores = (mips_cps_cluster_config(cluster) + 1) & CM_GCR_CONFIG_PCORES; - - if (IS_ENABLED(CONFIG_SOC_MT7621)) { - struct cpulaunch *launch; - - /* - * Ralink MT7621S SoC is single core, but the GCR_CONFIG method - * always reports 2 cores. Check the second core's LAUNCH_FREADY - * flag to detect if the second core is missing. This method - * only works before the core has been started. 
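The Octeon hunk above spells out what handle_domain_irq() used to do: enter interrupt context, run the mapped handler, leave interrupt context. Below is a hedged sketch of a root handler written against the new helpers; irq_enter(), generic_handle_domain_irq() and irq_exit() are the real functions used in the hunk, while the surrounding handler and its name are invented.

#include <linux/hardirq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/printk.h>

/* Illustrative root handler: hwirq decoding happens before this is called. */
static void example_root_handler(struct irq_domain *domain, unsigned int hwirq)
{
        irq_enter();
        if (generic_handle_domain_irq(domain, hwirq))
                pr_warn_ratelimited("spurious hwirq %u\n", hwirq);
        irq_exit();
}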
- */ - launch = (struct cpulaunch *)CKSEG0ADDR(CPULAUNCH); - launch += 2; /* MT7621 has 2 VPEs per core */ - if (!(launch->flags & LAUNCH_FREADY)) - ncores = 1; - } - - return ncores; + return (mips_cps_cluster_config(cluster) + 1) & CM_GCR_CONFIG_PCORES; } /** diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index d20e002b3246..5e11582fe308 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -111,15 +111,9 @@ void __irq_entry do_IRQ(unsigned int irq) #ifdef CONFIG_IRQ_DOMAIN void __irq_entry do_domain_IRQ(struct irq_domain *domain, unsigned int hwirq) { - struct irq_desc *desc; - irq_enter(); check_stack_overflow(); - - desc = irq_resolve_mapping(domain, hwirq); - if (likely(desc)) - handle_irq_desc(desc); - + generic_handle_domain_irq(domain, hwirq); irq_exit(); } #endif diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c index b6ef5f7312cf..f5d7bfa3472a 100644 --- a/arch/mips/kernel/smp-bmips.c +++ b/arch/mips/kernel/smp-bmips.c @@ -26,6 +26,7 @@ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/kexec.h> +#include <linux/irq.h> #include <asm/time.h> #include <asm/processor.h> @@ -373,7 +374,7 @@ static int bmips_cpu_disable(void) set_cpu_online(cpu, false); calculate_cpu_foreign_map(); - irq_cpu_offline(); + irq_migrate_all_off_this_cpu(); clear_c0_status(IE_IRQ5); local_flush_tlb_all(); diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c index 09ebe84a17fe..660e1de4412a 100644 --- a/arch/mips/loongson64/smp.c +++ b/arch/mips/loongson64/smp.c @@ -550,7 +550,6 @@ static int loongson3_cpu_disable(void) set_cpu_online(cpu, false); calculate_cpu_foreign_map(); local_irq_save(flags); - irq_cpu_offline(); clear_c0_status(ST0_IM); local_irq_restore(flags); local_flush_tlb_all(); diff --git a/arch/mips/rb532/prom.c b/arch/mips/rb532/prom.c index 23ad8dd9aa5e..b11693715547 100644 --- a/arch/mips/rb532/prom.c +++ b/arch/mips/rb532/prom.c @@ -16,7 +16,6 @@ #include <linux/console.h> #include <linux/memblock.h> #include <linux/ioport.h> -#include <linux/blkdev.h> #include <asm/bootinfo.h> #include <asm/mach-rc32434/ddr.h> diff --git a/arch/mips/sibyte/common/cfe.c b/arch/mips/sibyte/common/cfe.c index a3323f8dcc1b..1a504294d85f 100644 --- a/arch/mips/sibyte/common/cfe.c +++ b/arch/mips/sibyte/common/cfe.c @@ -7,7 +7,6 @@ #include <linux/kernel.h> #include <linux/linkage.h> #include <linux/mm.h> -#include <linux/blkdev.h> #include <linux/memblock.h> #include <linux/pm.h> #include <linux/smp.h> diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 538a2791b48c..f07b15dd1c1a 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -11,7 +11,6 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memblock.h> -#include <linux/blkdev.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/screen_info.h> diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index aea26e739543..4d1421b18734 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -27,7 +27,6 @@ config NDS32 select GENERIC_LIB_MULDI3 select GENERIC_LIB_UCMPDI2 select GENERIC_TIME_VSYSCALL - select HANDLE_DOMAIN_IRQ select HAVE_ARCH_TRACEHOOK select HAVE_DEBUG_KMEMLEAK select HAVE_EXIT_THREAD diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index c2a222ebfa2a..3fc0bb7d6487 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -27,6 +27,7 @@ void flush_cache_vunmap(unsigned long start, unsigned long 
end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); void copy_from_user_page(struct vm_area_struct *vma, struct page *page, diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 0e23e3a8df6b..d55b73b18149 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -6,7 +6,7 @@ #ifndef CONFIG_DYNAMIC_FTRACE extern void (*ftrace_trace_function)(unsigned long, unsigned long, - struct ftrace_ops*, struct pt_regs*); + struct ftrace_ops*, struct ftrace_regs*); extern void ftrace_graph_caller(void); noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 18eb9f69f806..1999561b22aa 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -28,7 +28,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, extern void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); extern void flush_icache_range(unsigned long start, unsigned long end); extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); diff --git a/arch/nios2/include/asm/irqflags.h b/arch/nios2/include/asm/irqflags.h index b3ec3e510706..25acf27862f9 100644 --- a/arch/nios2/include/asm/irqflags.h +++ b/arch/nios2/include/asm/irqflags.h @@ -9,7 +9,7 @@ static inline unsigned long arch_local_save_flags(void) { - return RDCTL(CTL_STATUS); + return RDCTL(CTL_FSTATUS); } /* @@ -18,7 +18,7 @@ static inline unsigned long arch_local_save_flags(void) */ static inline void arch_local_irq_restore(unsigned long flags) { - WRCTL(CTL_STATUS, flags); + WRCTL(CTL_FSTATUS, flags); } static inline void arch_local_irq_disable(void) diff --git a/arch/nios2/include/asm/registers.h b/arch/nios2/include/asm/registers.h index 183c720e454d..95b67dd16f81 100644 --- a/arch/nios2/include/asm/registers.h +++ b/arch/nios2/include/asm/registers.h @@ -11,7 +11,7 @@ #endif /* control register numbers */ -#define CTL_STATUS 0 +#define CTL_FSTATUS 0 #define CTL_ESTATUS 1 #define CTL_BSTATUS 2 #define CTL_IENABLE 3 diff --git a/arch/nios2/platform/Kconfig.platform b/arch/nios2/platform/Kconfig.platform index 9e32fb7f3d4c..e849daff6fd1 100644 --- a/arch/nios2/platform/Kconfig.platform +++ b/arch/nios2/platform/Kconfig.platform @@ -37,6 +37,7 @@ config NIOS2_DTB_PHYS_ADDR config NIOS2_DTB_SOURCE_BOOL bool "Compile and link device tree into kernel image" + depends on !COMPILE_TEST help This allows you to specify a dts (device tree source) file which will be compiled and linked into the kernel image. 
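The nds32 hunk above fixes the extern declaration to the modern ftrace callback type, which receives a struct ftrace_regs pointer instead of pt_regs. A hedged sketch of a callback with that signature registered through register_ftrace_function(); the probe is deliberately empty and all names are illustrative.

#include <linux/ftrace.h>
#include <linux/init.h>

/* matches the four-argument callback type referenced in the hunk above */
static void example_trace_callback(unsigned long ip, unsigned long parent_ip,
                                   struct ftrace_ops *ops,
                                   struct ftrace_regs *fregs)
{
        /* intentionally empty: real probes must be fast and re-entrant */
}

static struct ftrace_ops example_ops = {
        .func = example_trace_callback,
};

static int __init example_trace_init(void)
{
        return register_ftrace_function(&example_ops);
}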
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e804026b4797..c2491b295d60 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -13,7 +13,6 @@ config OPENRISC select OF select OF_EARLY_FLATTREE select IRQ_DOMAIN - select HANDLE_DOMAIN_IRQ select GPIOLIB select HAVE_ARCH_TRACEHOOK select SPARSE_IRQ diff --git a/arch/openrisc/include/asm/spinlock.h b/arch/openrisc/include/asm/spinlock.h index a8940bdfcb7e..264944a71535 100644 --- a/arch/openrisc/include/asm/spinlock.h +++ b/arch/openrisc/include/asm/spinlock.h @@ -19,9 +19,6 @@ #include <asm/qrwlock.h> -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #define arch_spin_relax(lock) cpu_relax() #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index edaa775a648e..59c6d3aa7081 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -569,8 +569,8 @@ EXCEPTION_ENTRY(_external_irq_handler) #endif CLEAR_LWA_FLAG(r3) l.addi r3,r1,0 - l.movhi r8,hi(do_IRQ) - l.ori r8,r8,lo(do_IRQ) + l.movhi r8,hi(generic_handle_arch_irq) + l.ori r8,r8,lo(generic_handle_arch_irq) l.jalr r8 l.nop l.j _ret_from_intr diff --git a/arch/openrisc/kernel/irq.c b/arch/openrisc/kernel/irq.c index c38fa863afa8..f38e10962a84 100644 --- a/arch/openrisc/kernel/irq.c +++ b/arch/openrisc/kernel/irq.c @@ -36,8 +36,3 @@ void __init init_IRQ(void) { irqchip_init(); } - -void __irq_entry do_IRQ(struct pt_regs *regs) -{ - handle_arch_irq(regs); -} diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index cfef61a7b6c2..97305bde1b16 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -25,7 +25,6 @@ #include <linux/memblock.h> #include <linux/init.h> #include <linux/delay.h> -#include <linux/blkdev.h> /* for initrd_* */ #include <linux/pagemap.h> #include <asm/pgalloc.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4742b6f169b7..27a8b49af11f 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -384,6 +384,4 @@ config KEXEC_FILE endmenu -source "drivers/firmware/Kconfig" - source "drivers/parisc/Kconfig" diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index eef0096db5f8..da0cd4b3a28f 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -49,7 +49,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define flush_cache_vunmap(start, end) flush_cache_all() #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index fa5ee8a45dbd..a6e5d66a7656 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -23,21 +23,6 @@ static inline void arch_spin_lock(arch_spinlock_t *x) continue; } -static inline void arch_spin_lock_flags(arch_spinlock_t *x, - unsigned long flags) -{ - volatile unsigned int *a; - - a = __ldcw_align(x); - while (__ldcw(a) == 0) - while (*a == 0) - if (flags & PSW_SM_I) { - local_irq_enable(); - local_irq_disable(); - } -} -#define arch_spin_lock_flags arch_spin_lock_flags - static inline void 
arch_spin_unlock(arch_spinlock_t *x) { volatile unsigned int *a; diff --git a/arch/powerpc/boot/dts/fsl/t1023rdb.dts b/arch/powerpc/boot/dts/fsl/t1023rdb.dts index 5ba6fbfca274..f82f85c65964 100644 --- a/arch/powerpc/boot/dts/fsl/t1023rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1023rdb.dts @@ -154,7 +154,7 @@ fm1mac3: ethernet@e4000 { phy-handle = <&sgmii_aqr_phy3>; - phy-connection-type = "sgmii-2500"; + phy-connection-type = "2500base-x"; sleep = <&rcpm 0x20000000>; }; diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index d4b145b279f6..9f38040f0641 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -136,6 +136,14 @@ static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) if (kuap_is_disabled()) return; + if (unlikely(kuap != KUAP_NONE)) { + current->thread.kuap = KUAP_NONE; + kuap_lock(kuap, false); + } + + if (likely(regs->kuap == KUAP_NONE)) + return; + current->thread.kuap = regs->kuap; kuap_unlock(regs->kuap, false); diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index a95f63788c6b..4ba834599c4d 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -23,6 +23,7 @@ #define BRANCH_ABSOLUTE 0x2 bool is_offset_in_branch_range(long offset); +bool is_offset_in_cond_branch_range(long offset); int create_branch(struct ppc_inst *instr, const u32 *addr, unsigned long target, int flags); int create_cond_branch(struct ppc_inst *instr, const u32 *addr, diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 6b800d3e2681..a1d238255f07 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -265,13 +265,16 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte local_paca->irq_soft_mask = IRQS_ALL_DISABLED; local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - if (is_implicit_soft_masked(regs)) { - // Adjust regs->softe soft implicit soft-mask, so - // arch_irq_disabled_regs(regs) behaves as expected. + if (!(regs->msr & MSR_EE) || is_implicit_soft_masked(regs)) { + /* + * Adjust regs->softe to be soft-masked if it had not been + * reconcied (e.g., interrupt entry with MSR[EE]=0 but softe + * not yet set disabled), or if it was in an implicit soft + * masked state. This makes arch_irq_disabled_regs(regs) + * behave as expected. 
+ */ regs->softe = IRQS_ALL_DISABLED; } - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE)); /* Don't do any per-CPU operations until interrupt state is fixed */ @@ -525,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs) /* kernel/traps.c */ DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception); #ifdef CONFIG_PPC_BOOK3S_64 -DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception); -#else -DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception); +DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async); #endif +DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception); DECLARE_INTERRUPT_HANDLER(SMIException); DECLARE_INTERRUPT_HANDLER(handle_hmi_exception); DECLARE_INTERRUPT_HANDLER(unknown_exception); diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h index 792eefaf230b..27574f218b37 100644 --- a/arch/powerpc/include/asm/security_features.h +++ b/arch/powerpc/include/asm/security_features.h @@ -39,6 +39,11 @@ static inline bool security_ftr_enabled(u64 feature) return !!(powerpc_security_features & feature); } +#ifdef CONFIG_PPC_BOOK3S_64 +enum stf_barrier_type stf_barrier_type_get(void); +#else +static inline enum stf_barrier_type stf_barrier_type_get(void) { return STF_BARRIER_NONE; } +#endif // Features indicating support for Spectre/Meltdown mitigations diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h index 8985791a2ba5..7ae6aeef8464 100644 --- a/arch/powerpc/include/asm/simple_spinlock.h +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -123,27 +123,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) } } -static inline -void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - unsigned long flags_dis; - - while (1) { - if (likely(__arch_spin_trylock(lock) == 0)) - break; - local_save_flags(flags_dis); - local_irq_restore(flags); - do { - HMT_low(); - if (is_shared_processor()) - splpar_spin_yield(lock); - } while (unlikely(lock->slock != 0)); - HMT_medium(); - local_irq_restore(flags_dis); - } -} -#define arch_spin_lock_flags arch_spin_lock_flags - static inline void arch_spin_unlock(arch_spinlock_t *lock) { __asm__ __volatile__("# arch_spin_unlock\n\t" diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 111249fd619d..038ce8d9061d 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -184,6 +184,15 @@ u64 dma_iommu_get_required_mask(struct device *dev) struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; + if (dev_is_pci(dev)) { + u64 bypass_mask = dma_direct_get_required_mask(dev); + + if (dma_iommu_dma_supported(dev, bypass_mask)) { + dev_info(dev, "%s: returning bypass mask 0x%llx\n", __func__, bypass_mask); + return bypass_mask; + } + } + if (!tbl) return 0; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 37859e62a8dc..eaf1f72131a1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common) li r10,MSR_RI mtmsrd r10,1 addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception + bl machine_check_exception_async b interrupt_return_srr @@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) subi r12,r12,1 sth r12,PACA_IN_MCE(r13) - /* Invoke machine_check_exception to print MCE event and panic. 
*/ + /* + * Invoke machine_check_exception to print MCE event and panic. + * This is the NMI version of the handler because we are called from + * the early handler which is a true NMI. + */ addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception @@ -1665,27 +1669,30 @@ EXC_COMMON_BEGIN(program_check_common) */ andi. r10,r12,MSR_PR - bne 2f /* If userspace, go normal path */ + bne .Lnormal_stack /* If userspace, go normal path */ andis. r10,r12,(SRR1_PROGTM)@h - bne 1f /* If TM, emergency */ + bne .Lemergency_stack /* If TM, emergency */ cmpdi r1,-INT_FRAME_SIZE /* check if r1 is in userspace */ - blt 2f /* normal path if not */ + blt .Lnormal_stack /* normal path if not */ /* Use the emergency stack */ -1: andi. r10,r12,MSR_PR /* Set CR0 correctly for label */ +.Lemergency_stack: + andi. r10,r12,MSR_PR /* Set CR0 correctly for label */ /* 3 in EXCEPTION_PROLOG_COMMON */ mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ __ISTACK(program_check)=0 __GEN_COMMON_BODY program_check - b 3f -2: + b .Ldo_program_check + +.Lnormal_stack: __ISTACK(program_check)=1 __GEN_COMMON_BODY program_check -3: + +.Ldo_program_check: addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception REST_NVGPRS(r1) /* instruction emulation may change GPRs */ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index abb719b21cae..3d97fb833834 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -126,14 +126,16 @@ _GLOBAL(idle_return_gpr_loss) /* * This is the sequence required to execute idle instructions, as * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. - * - * The 0(r1) slot is used to save r2 in isa206, so use that here. + * We have to store a GPR somewhere, ptesync, then reload it, and create + * a false dependency on the result of the load. It doesn't matter which + * GPR we store, or where we store it. We have already stored r2 to the + * stack at -8(r1) in isa206_idle_insn_mayloss, so use that. */ #define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r2,0(r1); \ + std r2,-8(r1); \ ptesync; \ - ld r2,0(r1); \ + ld r2,-8(r1); \ 236: cmpd cr0,r2,r2; \ bne 236b; \ IDLE_INST; \ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 551b653228c4..c4f1d6b7d992 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -229,6 +229,9 @@ notrace void arch_local_irq_restore(unsigned long mask) return; } + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(in_nmi() || in_hardirq()); + /* * After the stb, interrupts are unmasked and there are no interrupts * pending replay. The restart sequence makes this atomic with @@ -321,6 +324,9 @@ notrace void arch_local_irq_restore(unsigned long mask) if (mask) return; + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(in_nmi() || in_hardirq()); + /* * From this point onward, we can take interrupts, preempt, * etc... unless we got hard-disabled. 
We check if an event diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 1a998490fe60..15fb5ea1b9ea 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -263,6 +263,11 @@ static int __init handle_no_stf_barrier(char *p) early_param("no_stf_barrier", handle_no_stf_barrier); +enum stf_barrier_type stf_barrier_type_get(void) +{ + return stf_enabled_flush_types; +} + /* This is the generic flag used by other architectures */ static int __init handle_ssbd(char *p) { diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 9cc7d3dbf439..605bab448f84 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1730,8 +1730,6 @@ void __cpu_die(unsigned int cpu) void arch_cpu_idle_dead(void) { - sched_preempt_enable_no_resched(); - /* * Disable on the down path. This will be re-enabled by * start_secondary() via start_secondary_resume() below diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index aac8c0412ff9..11741703d26e 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -340,10 +340,16 @@ static bool exception_common(int signr, struct pt_regs *regs, int code, return false; } - show_signal_msg(signr, regs, code, addr); + /* + * Must not enable interrupts even for user-mode exception, because + * this can be called from machine check, which may be a NMI or IRQ + * which don't like interrupts being enabled. Could check for + * in_hardirq || in_nmi perhaps, but there doesn't seem to be a good + * reason why _exception() should enable irqs for an exception handler, + * the handlers themselves do that directly. + */ - if (arch_irqs_disabled()) - interrupt_cond_local_irq_enable(regs); + show_signal_msg(signr, regs, code, addr); current->thread.trap_nr = code; @@ -790,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err) * do_exit() checks for in_interrupt() and panics in that case, so * exit the irq/nmi before calling die. */ - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) - irq_exit(); - else + if (in_nmi()) nmi_exit(); + else + irq_exit(); die(str, regs, err); } /* - * BOOK3S_64 does not call this handler as a non-maskable interrupt + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt * (it uses its own early real-mode handler to handle the MCE proper * and then raises irq_work to call this handler when interrupts are - * enabled). + * enabled). The only time when this is not true is if the early handler + * is unrecoverable, then it does call this directly to try to get a + * message out. */ -#ifdef CONFIG_PPC_BOOK3S_64 -DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception) -#else -DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) -#endif +static void __machine_check_exception(struct pt_regs *regs) { int recover = 0; @@ -841,12 +845,19 @@ bail: /* Must die if the interrupt is not recoverable */ if (regs_is_unrecoverable(regs)) die_mce("Unrecoverable Machine check", regs, SIGBUS); +} #ifdef CONFIG_PPC_BOOK3S_64 - return; -#else - return 0; +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async) +{ + __machine_check_exception(regs); +} #endif +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) +{ + __machine_check_exception(regs); + + return 0; } DEFINE_INTERRUPT_HANDLER(SMIException) /* async? 
*/ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 90484425a1e6..eb776d0c5d8e 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -255,13 +255,16 @@ kvm_novcpu_exit: * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ _GLOBAL(idle_kvm_start_guest) - ld r4,PACAEMERGSP(r13) mfcr r5 mflr r0 - std r1,0(r4) - std r5,8(r4) - std r0,16(r4) - subi r1,r4,STACK_FRAME_OVERHEAD + std r5, 8(r1) // Save CR in caller's frame + std r0, 16(r1) // Save LR in caller's frame + // Create frame on emergency stack + ld r4, PACAEMERGSP(r13) + stdu r1, -SWITCH_FRAME_SIZE(r4) + // Switch to new frame on emergency stack + mr r1, r4 + std r3, 32(r1) // Save SRR1 wakeup value SAVE_NVGPRS(r1) /* @@ -313,6 +316,10 @@ kvm_unsplit_wakeup: kvm_secondary_got_guest: + // About to go to guest, clear saved SRR1 + li r0, 0 + std r0, 32(r1) + /* Set HSTATE_DSCR(r13) to something sensible */ ld r6, PACA_DSCR_DEFAULT(r13) std r6, HSTATE_DSCR(r13) @@ -392,13 +399,12 @@ kvm_no_guest: mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 - /* set up r3 for return */ - mfspr r3,SPRN_SRR1 + // Return SRR1 wakeup value, or 0 if we went into the guest + ld r3, 32(r1) REST_NVGPRS(r1) - addi r1, r1, STACK_FRAME_OVERHEAD - ld r0, 16(r1) - ld r5, 8(r1) - ld r1, 0(r1) + ld r1, 0(r1) // Switch back to caller stack + ld r0, 16(r1) // Reload LR + ld r5, 8(r1) // Reload CR mtlr r0 mtcr r5 blr diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index f9a3019e37b4..c5ed98823835 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -228,6 +228,11 @@ bool is_offset_in_branch_range(long offset) return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3)); } +bool is_offset_in_cond_branch_range(long offset) +{ + return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3); +} + /* * Helper to check if a given instruction is a conditional branch * Derived from the conditional checks in analyse_instr() @@ -280,7 +285,7 @@ int create_cond_branch(struct ppc_inst *instr, const u32 *addr, offset = offset - (unsigned long)addr; /* Check we can represent the target in the instruction format */ - if (offset < -0x8000 || offset > 0x7FFF || offset & 0x3) + if (!is_offset_in_cond_branch_range(offset)) return 1; /* Mask out the flags and target, so they don't step on each other. */ diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 99fad093f43e..7e9b978b768e 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -24,16 +24,30 @@ #define EMIT(instr) PLANT_INSTR(image, ctx->idx, instr) /* Long jump; (unconditional 'branch') */ -#define PPC_JMP(dest) EMIT(PPC_INST_BRANCH | \ - (((dest) - (ctx->idx * 4)) & 0x03fffffc)) +#define PPC_JMP(dest) \ + do { \ + long offset = (long)(dest) - (ctx->idx * 4); \ + if (!is_offset_in_branch_range(offset)) { \ + pr_err_ratelimited("Branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ + return -ERANGE; \ + } \ + EMIT(PPC_INST_BRANCH | (offset & 0x03fffffc)); \ + } while (0) + /* blr; (unconditional 'branch' with link) to absolute address */ #define PPC_BL_ABS(dest) EMIT(PPC_INST_BL | \ (((dest) - (unsigned long)(image + ctx->idx)) & 0x03fffffc)) /* "cond" here covers BO:BI fields. 
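The reworked PPC_JMP above refuses to emit a branch whose displacement does not fit the encoding: roughly 26 signed bits, word aligned, for unconditional branches, and 16 signed bits, word aligned, for the conditional forms checked by is_offset_in_cond_branch_range(). A stand-alone copy of the two range checks with a few probe offsets:

#include <stdbool.h>
#include <stdio.h>

/* same checks as is_offset_in_branch_range()/is_offset_in_cond_branch_range() */
static bool in_branch_range(long offset)
{
        return offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3);
}

static bool in_cond_branch_range(long offset)
{
        return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3);
}

int main(void)
{
        long probes[] = { 0x7ffc, 0x8000, -0x8000, 0x1fffffc, 0x2000000, 6 };
        unsigned int i;

        for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
                printf("%11ld  b:%d  bc:%d\n", probes[i],
                       in_branch_range(probes[i]), in_cond_branch_range(probes[i]));
        return 0;
}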
*/ -#define PPC_BCC_SHORT(cond, dest) EMIT(PPC_INST_BRANCH_COND | \ - (((cond) & 0x3ff) << 16) | \ - (((dest) - (ctx->idx * 4)) & \ - 0xfffc)) +#define PPC_BCC_SHORT(cond, dest) \ + do { \ + long offset = (long)(dest) - (ctx->idx * 4); \ + if (!is_offset_in_cond_branch_range(offset)) { \ + pr_err_ratelimited("Conditional branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ + return -ERANGE; \ + } \ + EMIT(PPC_INST_BRANCH_COND | (((cond) & 0x3ff) << 16) | (offset & 0xfffc)); \ + } while (0) + /* Sign-extended 32-bit immediate load */ #define PPC_LI32(d, i) do { \ if ((int)(uintptr_t)(i) >= -32768 && \ @@ -78,11 +92,6 @@ #define PPC_FUNC_ADDR(d,i) do { PPC_LI32(d, i); } while(0) #endif -static inline bool is_nearbranch(int offset) -{ - return (offset < 32768) && (offset >= -32768); -} - /* * The fly in the ointment of code size changing from pass to pass is * avoided by padding the short branch case with a NOP. If code size differs @@ -91,7 +100,7 @@ static inline bool is_nearbranch(int offset) * state. */ #define PPC_BCC(cond, dest) do { \ - if (is_nearbranch((dest) - (ctx->idx * 4))) { \ + if (is_offset_in_cond_branch_range((long)(dest) - (ctx->idx * 4))) { \ PPC_BCC_SHORT(cond, dest); \ EMIT(PPC_RAW_NOP()); \ } else { \ diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index 7b713edfa7e2..b63b35e45e55 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -16,18 +16,18 @@ * with our redzone usage. * * [ prev sp ] <------------- - * [ nv gpr save area ] 6*8 | + * [ nv gpr save area ] 5*8 | * [ tail_call_cnt ] 8 | - * [ local_tmp_var ] 8 | + * [ local_tmp_var ] 16 | * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ /* for gpr non volatile registers BPG_REG_6 to 10 */ -#define BPF_PPC_STACK_SAVE (6*8) +#define BPF_PPC_STACK_SAVE (5*8) /* for bpf JIT code internal usage */ -#define BPF_PPC_STACK_LOCALS 16 +#define BPF_PPC_STACK_LOCALS 24 /* stack frame excluding BPF stack, ensure this is quadword aligned */ #define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 53aefee3fe70..fcbf7a917c56 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -210,7 +210,11 @@ skip_init_ctx: /* Now build the prologue, body code & epilogue for real. 
*/ cgctx.idx = 0; bpf_jit_build_prologue(code_base, &cgctx); - bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); + if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass)) { + bpf_jit_binary_free(bpf_hdr); + fp = org_fp; + goto out_addrs; + } bpf_jit_build_epilogue(code_base, &cgctx); if (bpf_jit_enable > 1) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index beb12cbc8c29..0da31d41d413 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -200,7 +200,7 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun } } -static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) +static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* * By now, the eBPF program has already setup parameters in r3-r6 @@ -261,7 +261,9 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 bpf_jit_emit_common_epilogue(image, ctx); EMIT(PPC_RAW_BCTR()); + /* out: */ + return 0; } /* Assemble the body code between the prologue & epilogue */ @@ -355,7 +357,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * PPC_LI32(_R0, imm); EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, _R0)); } - if (imm >= 0) + if (imm >= 0 || (BPF_OP(code) == BPF_SUB && imm == 0x80000000)) EMIT(PPC_RAW_ADDZE(dst_reg_h, dst_reg_h)); else EMIT(PPC_RAW_ADDME(dst_reg_h, dst_reg_h)); @@ -623,7 +625,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * EMIT(PPC_RAW_LI(dst_reg_h, 0)); break; case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */ - EMIT(PPC_RAW_SRAW(dst_reg_h, dst_reg, src_reg)); + EMIT(PPC_RAW_SRAW(dst_reg, dst_reg, src_reg)); break; case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */ bpf_set_seen_register(ctx, tmp_reg); @@ -1073,7 +1075,7 @@ cond_branch: break; case BPF_JMP32 | BPF_JSET | BPF_K: /* andi does not sign-extend the immediate */ - if (imm >= -32768 && imm < 32768) { + if (imm >= 0 && imm < 32768) { /* PPC_ANDI is _only/always_ dot-form */ EMIT(PPC_RAW_ANDI(_R0, dst_reg, imm)); } else { @@ -1090,7 +1092,9 @@ cond_branch: */ case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; - bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + ret = bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + if (ret < 0) + return ret; break; default: @@ -1103,7 +1107,7 @@ cond_branch: return -EOPNOTSUPP; } if (BPF_CLASS(code) == BPF_ALU && !fp->aux->verifier_zext && - !insn_is_zext(&insn[i + 1])) + !insn_is_zext(&insn[i + 1]) && !(BPF_OP(code) == BPF_END && imm == 64)) EMIT(PPC_RAW_LI(dst_reg_h, 0)); } diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index b87a63dba9c8..8b5157ccfeba 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -15,6 +15,7 @@ #include <linux/if_vlan.h> #include <asm/kprobes.h> #include <linux/bpf.h> +#include <asm/security_features.h> #include "bpf_jit64.h" @@ -35,9 +36,9 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... 
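The JSET-by-immediate fix above narrows the short andi. path to non-negative immediates: PowerPC's andi. zero-extends its 16-bit operand, so a negative BPF immediate, whose mask has the upper bits set, cannot be encoded that way. A small demonstration of the mismatch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t dst = 0x00010000;              /* illustrative register value */
        int32_t imm = -1;                       /* mask 0xffffffff as a BPF immediate */

        uint32_t correct = dst & (uint32_t)imm;                 /* what JSET means */
        uint32_t andi_like = dst & ((uint32_t)imm & 0xffff);    /* what andi. would test */

        printf("correct test: 0x%x -> branch %staken\n", correct, correct ? "" : "not ");
        printf("andi-style  : 0x%x -> branch %staken\n", andi_like, andi_like ? "" : "not ");
        return 0;
}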
] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 6*8 + * [ nv gpr save area ] 5*8 * [ tail_call_cnt ] 8 - * [ local_tmp_var ] 8 + * [ local_tmp_var ] 16 * [ unused red zone ] 208 bytes protected */ static int bpf_jit_stack_local(struct codegen_context *ctx) @@ -45,12 +46,12 @@ static int bpf_jit_stack_local(struct codegen_context *ctx) if (bpf_has_stack_frame(ctx)) return STACK_FRAME_MIN_SIZE + ctx->stack_size; else - return -(BPF_PPC_STACK_SAVE + 16); + return -(BPF_PPC_STACK_SAVE + 24); } static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) { - return bpf_jit_stack_local(ctx) + 8; + return bpf_jit_stack_local(ctx) + 16; } static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) @@ -206,7 +207,7 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun EMIT(PPC_RAW_BCTRL()); } -static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) +static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* * By now, the eBPF program has already setup parameters in r3, r4 and r5 @@ -267,13 +268,38 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 bpf_jit_emit_common_epilogue(image, ctx); EMIT(PPC_RAW_BCTR()); + /* out: */ + return 0; } +/* + * We spill into the redzone always, even if the bpf program has its own stackframe. + * Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local() + */ +void bpf_stf_barrier(void); + +asm ( +" .global bpf_stf_barrier ;" +" bpf_stf_barrier: ;" +" std 21,-64(1) ;" +" std 22,-56(1) ;" +" sync ;" +" ld 21,-64(1) ;" +" ld 22,-56(1) ;" +" ori 31,31,0 ;" +" .rept 14 ;" +" b 1f ;" +" 1: ;" +" .endr ;" +" blr ;" +); + /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, u32 *addrs, bool extra_pass) { + enum stf_barrier_type stf_barrier = stf_barrier_type_get(); const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; int i, ret; @@ -328,18 +354,25 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * EMIT(PPC_RAW_SUB(dst_reg, dst_reg, src_reg)); goto bpf_alu32_trunc; case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */ - case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */ + if (!imm) { + goto bpf_alu32_trunc; + } else if (imm >= -32768 && imm < 32768) { + EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm))); + } else { + PPC_LI32(b2p[TMP_REG_1], imm); + EMIT(PPC_RAW_ADD(dst_reg, dst_reg, b2p[TMP_REG_1])); + } + goto bpf_alu32_trunc; + case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */ - if (BPF_OP(code) == BPF_SUB) - imm = -imm; - if (imm) { - if (imm >= -32768 && imm < 32768) - EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm))); - else { - PPC_LI32(b2p[TMP_REG_1], imm); - EMIT(PPC_RAW_ADD(dst_reg, dst_reg, b2p[TMP_REG_1])); - } + if (!imm) { + goto bpf_alu32_trunc; + } else if (imm > -32768 && imm <= 32768) { + EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(-imm))); + } else { + PPC_LI32(b2p[TMP_REG_1], imm); + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, b2p[TMP_REG_1])); } goto bpf_alu32_trunc; case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */ @@ -389,8 +422,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */ if (imm == 0) return -EINVAL; - else if (imm == 1) 
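The ADD/SUB-by-immediate rewrite above stops folding dst -= imm into dst += -imm: for imm = 0x80000000 the negation wraps back to the same value in 32-bit two's complement, so the old sequence added -2^31 where it should have subtracted it. A small stand-alone demonstration of that corner case:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int32_t imm = INT32_MIN;                /* BPF immediate 0x80000000 */
        int64_t dst = 10;

        /* what the instruction asks for: dst -= sign_extend64(imm) */
        int64_t correct = dst - (int64_t)imm;

        /*
         * What "negate the immediate, then add" produces: in 32-bit two's
         * complement, -0x80000000 wraps back to 0x80000000, so the addend
         * is still negative.
         */
        int32_t wrapped = (int32_t)(0u - (uint32_t)imm);
        int64_t folded = dst + (int64_t)wrapped;

        printf("correct: %" PRId64 "\n", correct);      /* 10 + 2^31 */
        printf("folded : %" PRId64 "\n", folded);       /* 10 - 2^31, wrong */
        return 0;
}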
- goto bpf_alu32_trunc; + if (imm == 1) { + if (BPF_OP(code) == BPF_DIV) { + goto bpf_alu32_trunc; + } else { + EMIT(PPC_RAW_LI(dst_reg, 0)); + break; + } + } PPC_LI32(b2p[TMP_REG_1], imm); switch (BPF_CLASS(code)) { @@ -631,6 +670,29 @@ emit_clear: * BPF_ST NOSPEC (speculation barrier) */ case BPF_ST | BPF_NOSPEC: + if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) || + !security_ftr_enabled(SEC_FTR_STF_BARRIER)) + break; + + switch (stf_barrier) { + case STF_BARRIER_EIEIO: + EMIT(PPC_RAW_EIEIO() | 0x02000000); + break; + case STF_BARRIER_SYNC_ORI: + EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_LD(b2p[TMP_REG_1], _R13, 0)); + EMIT(PPC_RAW_ORI(_R31, _R31, 0)); + break; + case STF_BARRIER_FALLBACK: + EMIT(PPC_RAW_MFLR(b2p[TMP_REG_1])); + PPC_LI64(12, dereference_kernel_function_descriptor(bpf_stf_barrier)); + EMIT(PPC_RAW_MTCTR(12)); + EMIT(PPC_RAW_BCTRL()); + EMIT(PPC_RAW_MTLR(b2p[TMP_REG_1])); + break; + case STF_BARRIER_NONE: + break; + } break; /* @@ -993,7 +1055,9 @@ cond_branch: */ case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; - bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + ret = bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + if (ret < 0) + return ret; break; default: diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index f92bf5f6b74f..7ea873ab2e6f 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -238,11 +238,27 @@ static inline u64 isa207_find_source(u64 idx, u32 sub_idx) ret |= P(SNOOP, HIT); break; case 5: - ret = PH(LVL, REM_CCE1); - if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4)) - ret |= P(SNOOP, HIT); - else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5)) - ret |= P(SNOOP, HITM); + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + ret = REM | P(HOPS, 0); + + if (sub_idx == 0 || sub_idx == 4) + ret |= PH(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT); + else if (sub_idx == 1 || sub_idx == 5) + ret |= PH(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM); + else if (sub_idx == 2 || sub_idx == 6) + ret |= PH(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + else if (sub_idx == 3 || sub_idx == 7) + ret |= PH(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + } else { + if (sub_idx == 0) + ret = PH(LVL, L2) | LEVEL(L2) | REM | P(SNOOP, HIT) | P(HOPS, 0); + else if (sub_idx == 1) + ret = PH(LVL, L2) | LEVEL(L2) | REM | P(SNOOP, HITM) | P(HOPS, 0); + else if (sub_idx == 2 || sub_idx == 4) + ret = PH(LVL, L3) | LEVEL(L3) | REM | P(SNOOP, HIT) | P(HOPS, 0); + else if (sub_idx == 3 || sub_idx == 5) + ret = PH(LVL, L3) | LEVEL(L3) | REM | P(SNOOP, HITM) | P(HOPS, 0); + } break; case 6: ret = PH(LVL, REM_CCE2); diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 4a2cbc3dc047..ff122603989b 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -273,6 +273,8 @@ #define P(a, b) PERF_MEM_S(a, b) #define PH(a, b) (P(LVL, HIT) | P(a, b)) #define PM(a, b) (P(LVL, MISS) | P(a, b)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp, u64 event_config1); int isa207_compute_mmcr(u64 event[], int n_ev, diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index bed05b644c2c..cb25acccd746 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -21,6 +21,7 @@ #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/poll.h> +#include <linux/seq_file.h> #include <linux/slab.h> #include 
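A note on the BPF_ADD/BPF_SUB split in the powerpc JIT hunk above: an addi can only carry a signed 16-bit immediate, and SUB is emitted as addi with the negated constant, which is why its range check is shifted by one relative to ADD. A standalone userspace sketch of that decision (plain C with illustrative names, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Does a value fit the signed 16-bit immediate field of addi? */
static int simm16_fits(int64_t v)
{
        return v >= -32768 && v <= 32767;
}

/*
 * ADD can use a single addi when imm itself fits; SUB is emitted as addi
 * with -imm, so imm == 32768 is fine but imm == -32768 is not (its negation
 * does not fit in 16 bits).  imm == 0 needs no instruction at all.
 */
static const char *pick_strategy(int op_is_sub, int32_t imm)
{
        int64_t folded = op_is_sub ? -(int64_t)imm : imm;

        if (imm == 0)
                return "no instruction (dst unchanged)";
        if (simm16_fits(folded))
                return "single addi with folded immediate";
        return "load imm into a temp register, then add/sub";
}

int main(void)
{
        printf("add 1      -> %s\n", pick_strategy(0, 1));
        printf("sub 32768  -> %s\n", pick_strategy(1, 32768));
        printf("sub -32768 -> %s\n", pick_strategy(1, -32768));
        printf("add 100000 -> %s\n", pick_strategy(0, 100000));
        return 0;
}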
<asm/prom.h> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index bc15200852b7..09fafcf2d3a0 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -867,6 +867,10 @@ static int __init eeh_pseries_init(void) if (is_kdump_kernel() || reset_devices) { pr_info("Issue PHB reset ...\n"); list_for_each_entry(phb, &hose_list, list_node) { + // Skip if the slot is empty + if (list_empty(&PCI_DN(phb->dn)->child_list)) + continue; + pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list); config_addr = pseries_eeh_get_pe_config_addr(pdn); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index dab5c56ffd0e..a52af8fbf571 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1302,6 +1302,12 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct property *default_win; int reset_win_ext; + /* DDW + IOMMU on single window may fail if there is any allocation */ + if (iommu_table_in_use(tbl)) { + dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n"); + goto out_failed; + } + default_win = of_find_property(pdn, "ibm,dma-window", NULL); if (!default_win) goto out_failed; @@ -1356,12 +1362,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) query.largest_available_block, 1ULL << page_shift); - /* DDW + IOMMU on single window may fail if there is any allocation */ - if (default_win_removed && iommu_table_in_use(tbl)) { - dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n"); - goto out_failed; - } - len = order_base_2(query.largest_available_block << page_shift); win_name = DMA64_PROPNAME; } else { @@ -1411,18 +1411,19 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) } else { struct iommu_table *newtbl; int i; + unsigned long start = 0, end = 0; for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; /* Look for MMIO32 */ - if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) + if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) { + start = pci->phb->mem_resources[i].start; + end = pci->phb->mem_resources[i].end; break; + } } - if (i == ARRAY_SIZE(pci->phb->mem_resources)) - goto out_del_list; - /* New table for using DDW instead of the default DMA window */ newtbl = iommu_pseries_alloc_table(pci->phb->node); if (!newtbl) { @@ -1432,15 +1433,15 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr, 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); - iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start, - pci->phb->mem_resources[i].end); + iommu_init_table(newtbl, pci->phb->node, start, end); pci->table_group->tables[1] = newtbl; /* Keep default DMA window stuct if removed */ if (default_win_removed) { tbl->it_size = 0; - kfree(tbl->it_map); + vfree(tbl->it_map); + tbl->it_map = NULL; } set_iommu_table_base(&dev->dev, newtbl); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 1b305e411862..8627362f613e 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -507,12 +507,27 @@ static void pseries_msi_unmask(struct irq_data *d) irq_chip_unmask_parent(d); } +static void 
pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(data); + + /* + * Do not update the MSIx vector table. It's not strictly necessary + * because the table is initialized by the underlying hypervisor, PowerVM + * or QEMU/KVM. However, if the MSIx vector entry is cleared, any further + * activation will fail. This can happen in some drivers (eg. IPR) which + * deactivate an IRQ used for testing MSI support. + */ + entry->msg = *msg; +} + static struct irq_chip pseries_pci_msi_irq_chip = { .name = "pSeries-PCI-MSI", .irq_shutdown = pseries_msi_shutdown, .irq_mask = pseries_msi_mask, .irq_unmask = pseries_msi_unmask, .irq_eoi = irq_chip_eoi_parent, + .irq_write_msi_msg = pseries_msi_write_msg, }; static struct msi_domain_info pseries_msi_domain_info = { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index c732ce5a3e1a..c5d75c02ad8b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -945,7 +945,8 @@ static int xive_get_irqchip_state(struct irq_data *data, * interrupt to be inactive in that case. */ *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && - (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); + (xd->saved_p || (!!(pq & XIVE_ESB_VAL_P) && + !irqd_irq_disabled(data))); return 0; default: return -EINVAL; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 301a54233c7e..c28b743eba57 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -62,7 +62,6 @@ config RISCV select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if MMU && 64BIT - select HANDLE_DOMAIN_IRQ select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL @@ -163,6 +162,12 @@ config PAGE_OFFSET default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB +config KASAN_SHADOW_OFFSET + hex + depends on KASAN_GENERIC + default 0xdfffffc800000000 if 64BIT + default 0xffffffff if 32BIT + config ARCH_FLATMEM_ENABLE def_bool !NUMA @@ -561,5 +566,3 @@ menu "Power management options" source "kernel/power/Kconfig" endmenu - -source "drivers/firmware/Kconfig" diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index a2b3d9cdbc86..b00f503ec124 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -30,8 +30,7 @@ #define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) #define KASAN_SHADOW_START KERN_VIRT_START #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) -#define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << \ - (64 - KASAN_SHADOW_SCALE_SHIFT))) +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); asmlinkage void kasan_early_init(void); diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index b933b1583c9f..34fbb3ea21d5 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -82,4 +82,5 @@ static inline int syscall_get_arch(struct task_struct *task) #endif } +asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); #endif /* _ASM_RISCV_SYSCALL_H */ diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index 893e47195e30..208e31bc5d1c 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -16,18 +16,24 @@ #ifdef CONFIG_MMU #include <linux/types.h> -#include <generated/vdso-offsets.h> +/* 
+ * All systems with an MMU have a VDSO, but systems without an MMU don't + * support shared libraries and therefor don't have one. + */ +#ifdef CONFIG_MMU + +#define __VVAR_PAGES 1 -#ifndef CONFIG_GENERIC_TIME_VSYSCALL -struct vdso_data { -}; -#endif +#ifndef __ASSEMBLY__ +#include <generated/vdso-offsets.h> #define VDSO_SYMBOL(base, name) \ (void __user *)((unsigned long)(base) + __vdso_##name##_offset) #endif /* CONFIG_MMU */ -asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_VDSO_H */ diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h index 4b989ae15d59..8062996c2dfd 100644 --- a/arch/riscv/include/uapi/asm/unistd.h +++ b/arch/riscv/include/uapi/asm/unistd.h @@ -18,9 +18,10 @@ #ifdef __LP64__ #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_WANT_SYS_CLONE3 #endif /* __LP64__ */ +#define __ARCH_WANT_SYS_CLONE3 + #include <asm-generic/unistd.h> /* diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 98f502654edd..64236f7efde5 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -130,8 +130,7 @@ skip_context_tracking: /* Handle interrupts */ move a0, sp /* pt_regs */ - la a1, handle_arch_irq - REG_L a1, (a1) + la a1, generic_handle_arch_irq jr a1 1: /* diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index fce5184b22c3..52c5ff9804c5 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -193,6 +193,7 @@ setup_trap_vector: csrw CSR_SCRATCH, zero ret +.align 2 .Lsecondary_park: /* We lack SMP support or have too many harts, so park this hart */ wfi diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 921d9d7df400..2f6da845c9ae 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -140,12 +140,9 @@ void arch_irq_work_raise(void) void handle_IPI(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); unsigned long *pending_ipis = &ipi_data[smp_processor_id()].bits; unsigned long *stats = ipi_data[smp_processor_id()].stats; - irq_enter(); - riscv_clear_ipi(); while (true) { @@ -156,7 +153,7 @@ void handle_IPI(struct pt_regs *regs) ops = xchg(pending_ipis, 0); if (ops == 0) - goto done; + return; if (ops & (1 << IPI_RESCHEDULE)) { stats[IPI_RESCHEDULE]++; @@ -189,10 +186,6 @@ void handle_IPI(struct pt_regs *regs) /* Order data access and bit testing. 
*/ mb(); } - -done: - irq_exit(); - set_irq_regs(old_regs); } static const char * const ipi_names[] = { diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index a63c667c27b3..44b1420a2270 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <linux/syscalls.h> #include <asm-generic/syscalls.h> -#include <asm/vdso.h> #include <asm/syscall.h> #undef __SYSCALL diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 25a3b8849599..b70956d80408 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -12,14 +12,24 @@ #include <linux/binfmts.h> #include <linux/err.h> #include <asm/page.h> +#include <asm/vdso.h> + #ifdef CONFIG_GENERIC_TIME_VSYSCALL #include <vdso/datapage.h> #else -#include <asm/vdso.h> +struct vdso_data { +}; #endif extern char vdso_start[], vdso_end[]; +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + +#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) + static unsigned int vdso_pages __ro_after_init; static struct page **vdso_pagelist __ro_after_init; @@ -38,7 +48,7 @@ static int __init vdso_init(void) vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; vdso_pagelist = - kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); + kcalloc(vdso_pages + VVAR_NR_PAGES, sizeof(struct page *), GFP_KERNEL); if (unlikely(vdso_pagelist == NULL)) { pr_err("vdso: pagelist allocation failed\n"); return -ENOMEM; @@ -63,38 +73,41 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long vdso_base, vdso_len; int ret; - vdso_len = (vdso_pages + 1) << PAGE_SHIFT; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + + vdso_len = (vdso_pages + VVAR_NR_PAGES) << PAGE_SHIFT; + + if (mmap_write_lock_killable(mm)) + return -EINTR; - mmap_write_lock(mm); vdso_base = get_unmapped_area(NULL, 0, vdso_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { ret = vdso_base; goto end; } - /* - * Put vDSO base into mm struct. We need to do this before calling - * install_special_mapping or the perf counter mmap tracking code - * will fail to recognise it as a vDSO (since arch_vma_name fails). - */ - mm->context.vdso = (void *)vdso_base; + mm->context.vdso = NULL; + ret = install_special_mapping(mm, vdso_base, VVAR_SIZE, + (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); + if (unlikely(ret)) + goto end; ret = - install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + install_special_mapping(mm, vdso_base + VVAR_SIZE, + vdso_pages << PAGE_SHIFT, (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), vdso_pagelist); - if (unlikely(ret)) { - mm->context.vdso = NULL; + if (unlikely(ret)) goto end; - } - vdso_base += (vdso_pages << PAGE_SHIFT); - ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, - (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); + /* + * Put vDSO base into mm struct. We need to do this before calling + * install_special_mapping or the perf counter mmap tracking code + * will fail to recognise it as a vDSO (since arch_vma_name fails). 
+ */ + mm->context.vdso = (void *)vdso_base + VVAR_SIZE; - if (unlikely(ret)) - mm->context.vdso = NULL; end: mmap_write_unlock(mm); return ret; @@ -105,7 +118,7 @@ const char *arch_vma_name(struct vm_area_struct *vma) if (vma->vm_mm && (vma->vm_start == (long)vma->vm_mm->context.vdso)) return "[vdso]"; if (vma->vm_mm && (vma->vm_start == - (long)vma->vm_mm->context.vdso + PAGE_SIZE)) + (long)vma->vm_mm->context.vdso - VVAR_SIZE)) return "[vdso_data]"; return NULL; } diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index e6f558bca71b..e9111f700af0 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -3,12 +3,13 @@ * Copyright (C) 2012 Regents of the University of California */ #include <asm/page.h> +#include <asm/vdso.h> OUTPUT_ARCH(riscv) SECTIONS { - PROVIDE(_vdso_data = . + PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 094118663285..89f81067e09e 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -16,6 +16,8 @@ static void ipi_remote_fence_i(void *info) void flush_icache_all(void) { + local_flush_icache_all(); + if (IS_ENABLED(CONFIG_RISCV_SBI)) sbi_remote_fence_i(NULL); else diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index d7189c8714a9..54294f83513d 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -17,6 +17,9 @@ asmlinkage void __init kasan_early_init(void) uintptr_t i; pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START); + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != + KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); + for (i = 0; i < PTRS_PER_PTE; ++i) set_pte(kasan_early_shadow_pte + i, mk_pte(virt_to_page(kasan_early_shadow_page), @@ -172,21 +175,10 @@ void __init kasan_init(void) phys_addr_t p_start, p_end; u64 i; - /* - * Populate all kernel virtual address space with kasan_early_shadow_page - * except for the linear mapping and the modules/kernel/BPF mapping. 
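To make the new RISC-V vDSO layout above concrete: the VVAR page now sits at the start of the mapping and the vDSO code follows it, so mm->context.vdso points VVAR_SIZE past the base and arch_vma_name() finds the data page at context.vdso - VVAR_SIZE. A small userspace sketch of that arithmetic, with purely hypothetical addresses:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)
#define VVAR_PAGES  1
#define VVAR_SIZE   (VVAR_PAGES * PAGE_SIZE)

int main(void)
{
        /* Hypothetical numbers, purely for illustration. */
        uint64_t vdso_base  = 0x7fff00000000ULL;  /* as returned by get_unmapped_area() */
        uint64_t vdso_pages = 2;                  /* size of the vDSO code image */

        uint64_t vvar_start   = vdso_base;              /* the "[vdso_data]" mapping */
        uint64_t code_start   = vdso_base + VVAR_SIZE;  /* the "[vdso]" mapping */
        uint64_t context_vdso = code_start;             /* what mm->context.vdso now stores */

        /* The two identities arch_vma_name() relies on after the change. */
        assert(code_start == context_vdso);
        assert(vvar_start == context_vdso - VVAR_SIZE);

        printf("vvar: 0x%llx-0x%llx  vdso: 0x%llx-0x%llx\n",
               (unsigned long long)vvar_start,
               (unsigned long long)(vvar_start + VVAR_SIZE),
               (unsigned long long)code_start,
               (unsigned long long)(code_start + (vdso_pages << PAGE_SHIFT)));
        return 0;
}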
- */ - kasan_populate_early_shadow((void *)KASAN_SHADOW_START, - (void *)kasan_mem_to_shadow((void *) - VMEMMAP_END)); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) kasan_shallow_populate( (void *)kasan_mem_to_shadow((void *)VMALLOC_START), (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); - else - kasan_populate_early_shadow( - (void *)kasan_mem_to_shadow((void *)VMALLOC_START), - (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); /* Populate the linear mapping */ for_each_mem_range(i, &p_start, &p_end) { diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index fed86f42dfbe..753d85bdfad0 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -125,7 +125,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (i == NR_JIT_ITERATIONS) { pr_err("bpf-jit: image did not converge in <%d passes!\n", i); - bpf_jit_binary_free(jit_data->header); + if (jit_data->header) + bpf_jit_binary_free(jit_data->header); prog = orig_prog; goto out_offset; } @@ -166,6 +167,11 @@ out: return prog; } +u64 bpf_jit_alloc_exec_limit(void) +{ + return BPF_JIT_REGION_SIZE; +} + void *bpf_jit_alloc_exec(unsigned long size) { return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index e4803ec51110..6b3c366af78e 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -207,6 +207,8 @@ int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); +void zpci_device_reserved(struct zpci_dev *zdev); +bool zpci_is_device_configured(struct zpci_dev *zdev); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); int zpci_unregister_ioat(struct zpci_dev *, u8); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index ef59588a3042..888a2f1c9ee3 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp) arch_spin_lock_wait(lp); } -static inline void arch_spin_lock_flags(arch_spinlock_t *lp, - unsigned long flags) -{ - if (!arch_spin_trylock_once(lp)) - arch_spin_lock_wait(lp); -} -#define arch_spin_lock_flags arch_spin_lock_flags - static inline int arch_spin_trylock(arch_spinlock_t *lp) { if (!arch_spin_trylock_once(lp)) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index b9f85b2dc053..6af59c59cc1b 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -894,6 +894,11 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, /** * guest_translate_address - translate guest logical into guest absolute address + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @gpa: Guest physical address + * @mode: Translation access mode * * Parameter semantics are the same as the ones from guest_translate. * The memory contents at the guest address are not changed. 
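On the KASAN_SHADOW_OFFSET constant introduced for RISC-V above: generic KASAN maps an address to its shadow as (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET, and the new BUILD_BUG_ON pins the Kconfig value to KASAN_SHADOW_END - (1 << (64 - scale)). A standalone arithmetic check, assuming the usual generic-KASAN scale shift of 3:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define KASAN_SHADOW_SCALE_SHIFT  3                       /* generic KASAN: 1 shadow byte per 8 bytes */
#define KASAN_SHADOW_OFFSET       0xdfffffc800000000ULL   /* the 64BIT Kconfig default above */

/* Generic KASAN address-to-shadow translation. */
static uint64_t mem_to_shadow(uint64_t addr)
{
        return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
        /* The new BUILD_BUG_ON pins OFFSET = KASAN_SHADOW_END - 2^(64 - scale). */
        uint64_t shadow_end = KASAN_SHADOW_OFFSET +
                              ((uint64_t)1 << (64 - KASAN_SHADOW_SCALE_SHIFT));

        printf("KASAN_SHADOW_END = 0x%llx\n", (unsigned long long)shadow_end);

        /* The top of the 64-bit address space shadows to just below that end. */
        assert(mem_to_shadow(~(uint64_t)0) == shadow_end - 1);
        return 0;
}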
@@ -934,6 +939,11 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, /** * check_gva_range - test a range of guest virtual addresses for accessibility + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @length: Length of test range + * @mode: Translation access mode */ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode) @@ -956,6 +966,7 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, /** * kvm_s390_check_low_addr_prot_real - check for low-address protection + * @vcpu: virtual cpu * @gra: Guest real address * * Checks whether an address is subject to low-address protection and set @@ -979,6 +990,7 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * @pgt: pointer to the beginning of the page table for the given address if * successful (return value 0), or to the first invalid DAT entry in * case of exceptions (return value > 0) + * @dat_protection: referenced memory is write protected * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 72b25b7cc6ae..2bd8f854f1b4 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -269,6 +269,7 @@ static int handle_prog(struct kvm_vcpu *vcpu) /** * handle_external_interrupt - used for external interruption interceptions + * @vcpu: virtual cpu * * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if * the new PSW does not have external interrupts disabled. In the first case, @@ -315,7 +316,8 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) } /** - * Handle MOVE PAGE partial execution interception. + * handle_mvpg_pei - Handle MOVE PAGE partial execution interception. + * @vcpu: virtual cpu * * This interception can only happen for guests with DAT disabled and * addresses that are currently not mapped in the host. 
Thus we try to diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 10722455fd02..2245f4b8d362 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3053,13 +3053,14 @@ static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; + u8 vcpu_isc_mask; for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; - deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); - if (deliverable_mask) { + vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask & vcpu_isc_mask) { /* lately kicked but not yet running */ if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6a6dd5e1daf6..1c97493d21e1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3363,6 +3363,7 @@ out_free_sie_block: int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); return kvm_s390_vcpu_has_irq(vcpu, 0); } diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index cfcdf76d6a95..a95ca6df4e5e 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -259,14 +259,13 @@ EXPORT_SYMBOL(strcmp); #ifdef __HAVE_ARCH_STRRCHR char *strrchr(const char *s, int c) { - size_t len = __strend(s) - s; - - if (len) - do { - if (s[len] == (char) c) - return (char *) s + len; - } while (--len > 0); - return NULL; + ssize_t len = __strend(s) - s; + + do { + if (s[len] == (char)c) + return (char *)s + len; + } while (--len >= 0); + return NULL; } EXPORT_SYMBOL(strrchr); #endif diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 840d8594437d..1a374d021e25 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1826,7 +1826,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) { fp = orig_fp; - goto out; + goto free_addrs; } /* * Three initial passes: diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index e7e6788d75a8..b833155ce838 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -92,7 +92,7 @@ void zpci_remove_reserved_devices(void) spin_unlock(&zpci_list_lock); list_for_each_entry_safe(zdev, tmp, &remove, entry) - zpci_zdev_put(zdev); + zpci_device_reserved(zdev); } int pci_domain_nr(struct pci_bus *bus) @@ -751,6 +751,14 @@ error: return ERR_PTR(rc); } +bool zpci_is_device_configured(struct zpci_dev *zdev) +{ + enum zpci_state state = zdev->state; + + return state != ZPCI_FN_STATE_RESERVED && + state != ZPCI_FN_STATE_STANDBY; +} + /** * zpci_scan_configured_device() - Scan a freshly configured zpci_dev * @zdev: The zpci_dev to be configured @@ -822,6 +830,31 @@ int zpci_deconfigure_device(struct zpci_dev *zdev) return 0; } +/** + * zpci_device_reserved() - Mark device as resverved + * @zdev: the zpci_dev that was reserved + * + * Handle the case that a given zPCI function was reserved by another system. + * After a call to this function the zpci_dev can not be found via + * get_zdev_by_fid() anymore but may still be accessible via existing + * references though it will not be functional anymore. 
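The s390 strrchr() change above switches the index to a signed type and scans from the terminating NUL down to and including index 0, so an empty string and a match in the first byte are handled, and strrchr(s, '\0') returns the terminator as C requires. A userspace mirror of the fixed loop with a few sanity checks (test_strrchr is an illustrative name, not the kernel symbol):

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Same shape as the fixed kernel loop; strlen(s) stands in for __strend(s) - s. */
static char *test_strrchr(const char *s, int c)
{
        ssize_t len = strlen(s);

        do {
                if (s[len] == (char)c)
                        return (char *)s + len;
        } while (--len >= 0);
        return NULL;
}

int main(void)
{
        const char *s = "abcabc";
        const char *empty = "";

        assert(test_strrchr(s, 'b') == s + 4);          /* last occurrence wins */
        assert(test_strrchr(s, 'a') == s + 3);
        assert(test_strrchr(s, '\0') == s + 6);         /* the terminator itself matches */
        assert(test_strrchr(s, 'x') == NULL);
        assert(test_strrchr("a", 'a') != NULL);         /* index 0, missed by the old loop */
        assert(test_strrchr(empty, '\0') == empty);     /* empty string, also missed before */
        printf("strrchr checks passed\n");
        return 0;
}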
+ */ +void zpci_device_reserved(struct zpci_dev *zdev) +{ + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + /* + * Remove device from zpci_list as it is going away. This also + * makes sure we ignore subsequent zPCI events for this device. + */ + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zdev->state = ZPCI_FN_STATE_RESERVED; + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + zpci_zdev_put(zdev); +} + void zpci_release_device(struct kref *kref) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); @@ -843,6 +876,12 @@ void zpci_release_device(struct kref *kref) case ZPCI_FN_STATE_STANDBY: if (zdev->has_hp_slot) zpci_exit_slot(zdev); + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + fallthrough; + case ZPCI_FN_STATE_RESERVED: if (zdev->has_resources) zpci_cleanup_bus_resources(zdev); zpci_bus_device_unregister(zdev); @@ -851,10 +890,6 @@ void zpci_release_device(struct kref *kref) default: break; } - - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); zpci_dbg(3, "rem fid:%x\n", zdev->fid); kfree(zdev); } diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index c856f80cb21b..5b8d647523f9 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -140,7 +140,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) /* The 0x0304 event may immediately reserve the device */ if (!clp_get_state(zdev->fid, &state) && state == ZPCI_FN_STATE_RESERVED) { - zpci_zdev_put(zdev); + zpci_device_reserved(zdev); } } break; @@ -151,7 +151,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0308: /* Standby -> Reserved */ if (!zdev) break; - zpci_zdev_put(zdev); + zpci_device_reserved(zdev); break; default: break; diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 372afa82fee6..c7a97f32432f 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -42,7 +42,8 @@ extern void flush_cache_page(struct vm_area_struct *vma, extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_user_range flush_icache_range extern void flush_icache_page(struct vm_area_struct *vma, diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index cd9dc0556e91..69d2d0049a61 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -27,6 +27,7 @@ #include <linux/blk-mq.h> #include <linux/ata.h> #include <linux/hdreg.h> +#include <linux/major.h> #include <linux/cdrom.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -854,8 +855,8 @@ static const struct attribute_group *ubd_attr_groups[] = { NULL, }; -static void ubd_disk_register(int major, u64 size, int unit, - struct gendisk *disk) +static int ubd_disk_register(int major, u64 size, int unit, + struct gendisk *disk) { disk->major = major; disk->first_minor = unit << UBD_SHIFT; @@ -872,7 +873,7 @@ static void ubd_disk_register(int major, u64 size, int unit, disk->private_data = &ubd_devs[unit]; disk->queue = ubd_devs[unit].queue; - device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups); + return 
device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups); } #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) @@ -919,10 +920,15 @@ static int ubd_add(int n, char **error_out) blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_max_segments(ubd_dev->queue, MAX_SG); blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); - ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); + err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); + if (err) + goto out_cleanup_disk; + ubd_gendisk[n] = disk; return 0; +out_cleanup_disk: + blk_cleanup_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&ubd_dev->tag_set); out: diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index a149a5e9a16a..54447690de11 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -421,6 +421,10 @@ void __init check_bugs(void) os_check_bugs(); } +void apply_retpolines(s32 *start, s32 *end) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 349e59b2f0e3..cdaeccc9127d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1416,7 +1416,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1536,7 +1536,6 @@ config AMD_MEM_ENCRYPT config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT bool "Activate AMD Secure Memory Encryption (SME) by default" - default y depends on AMD_MEM_ENCRYPT help Say yes to have system memory encrypted by default if running on @@ -2843,8 +2842,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -source "drivers/firmware/Kconfig" - source "arch/x86/kvm/Kconfig" source "arch/x86/Kconfig.assembler" diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 18d2f5199194..1cc72b4804fa 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -78,7 +78,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -133,6 +133,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index d2ffd7f76ee2..9c5d3f3ad45a 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -93,7 +93,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -148,6 +148,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 960a021d543e..7e25543693de 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -453,3 +453,4 @@ 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease +449 i386 futex_waitv sys_futex_waitv diff --git 
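The ubd change above is the usual gendisk error-unwinding pattern: device_add_disk() now returns an error, and ubd_add() unwinds in reverse order of setup (clean up the disk, then the tag set). A generic, self-contained sketch of that goto-unwind idiom (all resource names below are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

struct fake_dev { void *queue; void *disk; };

static int setup_queue(struct fake_dev *d)   { d->queue = malloc(1); return d->queue ? 0 : -12; }
static int setup_disk(struct fake_dev *d)    { d->disk = malloc(1);  return d->disk ? 0 : -12; }
static int register_disk(struct fake_dev *d) { (void)d; return -5; /* pretend registration fails */ }
static void cleanup_disk(struct fake_dev *d)  { free(d->disk); d->disk = NULL; }
static void cleanup_queue(struct fake_dev *d) { free(d->queue); d->queue = NULL; }

/* Unwind in reverse order of setup, as ubd_add() now does on failure. */
static int add_device(struct fake_dev *d)
{
        int err;

        err = setup_queue(d);
        if (err)
                goto out;
        err = setup_disk(d);
        if (err)
                goto out_cleanup_queue;
        err = register_disk(d);
        if (err)
                goto out_cleanup_disk;
        return 0;

out_cleanup_disk:
        cleanup_disk(d);
out_cleanup_queue:
        cleanup_queue(d);
out:
        return err;
}

int main(void)
{
        struct fake_dev d = { 0 };

        printf("add_device() = %d\n", add_device(&d));
        return 0;
}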
a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 18b5500ea8bf..fe8f8dd157b4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -370,6 +370,7 @@ 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6dfa8ddaa60f..38b2c779146f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -66,6 +66,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); +DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); + DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); @@ -1215,6 +1217,8 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; + static_call_cond(x86_pmu_assign)(event, idx); + switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: @@ -2005,6 +2009,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); + static_call_update(x86_pmu_assign, x86_pmu.assign); + static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6320d2cfd9d3..974e917e65b2 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -209,6 +209,12 @@ static void bts_update(struct bts_ctx *bts) } else { local_set(&buf->data_size, head); } + + /* + * Since BTS is coherent, just add compiler barrier to ensure + * BTS updating is ordered against bts::handle::event. 
+ */ + barrier(); } static int diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9a044438072b..aa3f5b527dc0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -243,7 +243,8 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ @@ -288,7 +289,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = { static struct event_constraint intel_spr_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ @@ -2403,6 +2404,12 @@ static void intel_pmu_disable_event(struct perf_event *event) intel_pmu_pebs_disable(event); } +static void intel_pmu_assign_event(struct perf_event *event, int idx) +{ + if (is_pebs_pt(event)) + perf_report_aux_output_id(event, idx); +} + static void intel_pmu_del_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -4495,8 +4502,16 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value) return intel_pmu_has_bts_period(event, value) ? 
-EINVAL : 0; } +static void intel_aux_output_init(void) +{ + /* Refer also intel_pmu_aux_output_match() */ + if (x86_pmu.intel_cap.pebs_output_pt_available) + x86_pmu.assign = intel_pmu_assign_event; +} + static int intel_pmu_aux_output_match(struct perf_event *event) { + /* intel_pmu_assign_event() is needed, refer intel_aux_output_init() */ if (!x86_pmu.intel_cap.pebs_output_pt_available) return 0; @@ -6302,6 +6317,8 @@ __init int intel_pmu_init(void) if (is_hybrid()) intel_pmu_check_hybrid_pmus((u64)fixed_mask); + intel_aux_output_init(); + return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8647713276a7..4dbb55a43dad 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -923,7 +923,8 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { }; struct event_constraint intel_icl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -943,7 +944,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { }; struct event_constraint intel_spr_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 7280c8a3c831..6d735611c281 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -30,7 +30,7 @@ #define uncore_discovery_invalid_unit(unit) \ - (!unit.table1 || !unit.ctl || !unit.table3 || \ + (!unit.table1 || !unit.ctl || \ unit.table1 == -1ULL || unit.ctl == -1ULL || \ unit.table3 == -1ULL) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 5ddc0f30db6f..eb2c6cea9d0d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -452,7 +452,7 @@ #define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0 /* ICX IMC */ -#define ICX_NUMBER_IMC_CHN 2 +#define ICX_NUMBER_IMC_CHN 3 #define ICX_IMC_MEM_STRIDE 0x4 /* SPR */ @@ -5076,8 +5076,10 @@ static struct event_constraint icx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x02, 0x3), UNCORE_EVENT_CONSTRAINT(0x03, 0x3), UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -5463,7 +5465,7 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = { static struct intel_uncore_type icx_uncore_imc = { .name = "imc", .num_counters = 4, - .num_boxes = 8, + .num_boxes = 12, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, @@ -5647,6 +5649,7 @@ static struct intel_uncore_type spr_uncore_chabox = { .event_mask = SPR_CHA_PMON_EVENT_MASK, .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, .num_shared_regs = 1, + .constraints = skx_uncore_chabox_constraints, .ops = &spr_uncore_chabox_ops, .format_group = &spr_uncore_chabox_format_group, .attr_update = uncore_alias_groups, @@ -5658,6 +5661,7 @@ static struct 
intel_uncore_type spr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .format_group = &snr_uncore_iio_format_group, .attr_update = uncore_alias_groups, + .constraints = icx_uncore_iio_constraints, }; static struct attribute *spr_uncore_raw_formats_attr[] = { @@ -5686,9 +5690,16 @@ static struct intel_uncore_type spr_uncore_irp = { }; +static struct event_constraint spr_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type spr_uncore_m2pcie = { SPR_UNCORE_COMMON_FORMAT(), .name = "m2pcie", + .constraints = spr_uncore_m2pcie_constraints, }; static struct intel_uncore_type spr_uncore_pcu = { @@ -5765,6 +5776,7 @@ static struct intel_uncore_type spr_uncore_upi = { static struct intel_uncore_type spr_uncore_m3upi = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "m3upi", + .constraints = icx_uncore_m3upi_constraints, }; static struct intel_uncore_type spr_uncore_mdf = { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index c853b28efa33..96c775abe31f 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_ATOM_SILVERMONT: case INTEL_FAM6_ATOM_SILVERMONT_D: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e3ac05c97b5e..76436a55d9ba 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -726,6 +726,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*assign)(struct perf_event *event, int idx); void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 32a1ad356c18..db2d92fb44da 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -122,17 +122,27 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, ipi_arg->reserved = 0; ipi_arg->vp_set.valid_bank_mask = 0; - if (!cpumask_equal(mask, cpu_present_mask)) { + /* + * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET + * when the IPI is sent to all currently present CPUs. + */ + if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; if (exclude_self) nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask); else nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); - } - if (nr_bank < 0) - goto ipi_mask_ex_done; - if (!nr_bank) + + /* + * 'nr_bank <= 0' means some CPUs in cpumask can't be + * represented in VP_SET. Return an error and fall back to + * native (architectural) method of sending IPIs. + */ + if (nr_bank <= 0) + goto ipi_mask_ex_done; + } else { ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, ipi_arg, NULL); diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h index 1b07fb102c4e..07949102a08d 100644 --- a/arch/x86/include/asm/GEN-for-each-reg.h +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -1,11 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are in machine order; things rely on that. 
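On the Hyper-V IPI hunk above: the rewritten logic only uses HV_GENERIC_SET_ALL when the request really covers every present CPU and the sender is not excluding itself; otherwise it builds a sparse VP_SET, and a bank count of zero or less means the mask cannot be represented, so the caller falls back to the architectural IPI path. A compact sketch of that decision (the helper name and parameters are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

enum ipi_format { SET_ALL, SET_SPARSE_4K, FALLBACK_NATIVE };

/*
 * HV_GENERIC_SET_ALL may only be used when the IPI targets every present CPU
 * and the sender is not excluding itself; nr_bank <= 0 means the cpumask
 * could not be encoded as a VP_SET, so fall back to the native IPI path.
 */
static enum ipi_format pick_format(bool mask_is_all_present, bool exclude_self,
                                   int nr_bank_if_sparse)
{
        if (mask_is_all_present && !exclude_self)
                return SET_ALL;
        if (nr_bank_if_sparse <= 0)
                return FALLBACK_NATIVE;
        return SET_SPARSE_4K;
}

int main(void)
{
        printf("%d %d %d\n",
               pick_format(true, false, 0),     /* SET_ALL */
               pick_format(true, true, 2),      /* SET_SPARSE_4K: self excluded */
               pick_format(false, false, -1));  /* FALLBACK_NATIVE */
        return 0;
}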
+ */ #ifdef CONFIG_64BIT GEN(rax) -GEN(rbx) GEN(rcx) GEN(rdx) +GEN(rbx) +GEN(rsp) +GEN(rbp) GEN(rsi) GEN(rdi) -GEN(rbp) GEN(r8) GEN(r9) GEN(r10) @@ -16,10 +21,11 @@ GEN(r14) GEN(r15) #else GEN(eax) -GEN(ebx) GEN(ecx) GEN(edx) +GEN(ebx) +GEN(esp) +GEN(ebp) GEN(esi) GEN(edi) -GEN(ebp) #endif diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index a3c2315aca12..58eee6402832 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,6 +75,7 @@ extern int alternatives_patched; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); struct module; diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 4cb726c71ed8..8f80de627c60 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,21 +17,3 @@ extern void cmpxchg8b_emu(void); #endif -#ifdef CONFIG_RETPOLINE - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_thunk_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_call_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 14ebd2196569..43184640b579 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -25,7 +25,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) * For !SMAP hardware we patch out CLAC on entry. */ if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 562854c60808..ea0c5ab31da4 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -185,6 +185,7 @@ IRQ_CONSTRAINTS, regs, vector); \ } +#ifndef CONFIG_PREEMPT_RT #define ASM_CALL_SOFTIRQ \ "call %P[__func] \n" @@ -201,6 +202,8 @@ __this_cpu_write(hardirq_stack_inuse, false); \ } +#endif + #else /* CONFIG_X86_64 */ /* System vector handlers always run on the stack they interrupted. 
*/ #define run_sysvec_on_irqstack_cond(func, regs) \ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8f48a7ec577..13f64654dfff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -702,7 +702,8 @@ struct kvm_vcpu_arch { struct kvm_pio_request pio; void *pio_data; - void *guest_ins_data; + void *sev_pio_data; + unsigned sev_pio_count; u8 event_exit_inst_len; @@ -1097,7 +1098,7 @@ struct kvm_arch { u64 cur_tsc_generation; int nr_vcpus_matched_tsc; - spinlock_t pvclock_gtod_sync_lock; + raw_spinlock_t pvclock_gtod_sync_lock; bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ec2d5c8c6694..cc74dc584836 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -5,12 +5,15 @@ #include <linux/static_key.h> #include <linux/objtool.h> +#include <linux/linkage.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> +#define RETPOLINE_THUNK_SIZE 32 + /* * Fill the CPU return stack buffer. * @@ -118,6 +121,16 @@ ".popsection\n\t" #ifdef CONFIG_RETPOLINE + +typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + +extern retpoline_thunk_t __x86_indirect_thunk_array[]; + #ifdef CONFIG_X86_64 /* @@ -303,63 +316,4 @@ static inline void mds_idle_clear_cpu_buffers(void) #endif /* __ASSEMBLY__ */ -/* - * Below is used in the eBPF JIT compiler and emits the byte sequence - * for the following assembly: - * - * With retpolines configured: - * - * callq do_rop - * spec_trap: - * pause - * lfence - * jmp spec_trap - * do_rop: - * mov %rcx,(%rsp) for x86_64 - * mov %edx,(%esp) for x86_32 - * retq - * - * Without retpolines configured: - * - * jmp *%rcx for x86_64 - * jmp *%edx for x86_32 - */ -#ifdef CONFIG_RETPOLINE -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 17 -# define RETPOLINE_RCX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* callq do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ - EMIT1(0xC3); /* retq */ \ -} while (0) -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* call do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \ - EMIT1(0xC3); /* ret */ \ -} while (0) -# endif -#else /* !CONFIG_RETPOLINE */ -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 2 -# define RETPOLINE_RCX_BPF_JIT() \ - EMIT2(0xFF, 0xE1); /* jmp *%rcx */ -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ - EMIT2(0xFF, 0xE2) /* jmp *%edx */ -# endif -#endif - #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index da3a1ac82be5..cebec95a7124 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -52,11 +52,11 @@ void __init paravirt_set_cap(void); /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO - 
pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); #endif } @@ -113,12 +113,12 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, /* * These special macros can be used to get or set a debugging register */ -static inline unsigned long paravirt_get_debugreg(int reg) +static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) -static inline void set_debugreg(unsigned long val, int reg) +static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } @@ -133,14 +133,14 @@ static inline void write_cr0(unsigned long x) PVOP_VCALL1(cpu.write_cr0, x); } -static inline unsigned long read_cr2(void) +static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline void write_cr2(unsigned long x) +static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } @@ -653,10 +653,10 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func -#define PV_CALLEE_SAVE_REGS_THUNK(func) \ +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ - asm(".pushsection .text;" \ + asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ PV_THUNK_NAME(func) ":" \ @@ -669,6 +669,9 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") +#define PV_CALLEE_SAVE_REGS_THUNK(func) \ + __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") + /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) @@ -678,23 +681,23 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index b94f615600d5..703663175a5a 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -181,7 +181,7 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp -static inline bool ip_within_syscall_gap(struct pt_regs *regs) +static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) { bool ret = (regs->ip >= (unsigned 
long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 454b20815f35..4a7ff8b0db20 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -308,13 +308,13 @@ HYPERVISOR_platform_op(struct xen_platform_op *op) return _hypercall1(int, platform_op, op); } -static inline int +static __always_inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) { return _hypercall2(int, set_debugreg, reg, value); } -static inline unsigned long +static __always_inline unsigned long HYPERVISOR_get_debugreg(int reg) { return _hypercall1(unsigned long, get_debugreg, reg); @@ -358,7 +358,7 @@ HYPERVISOR_event_channel_op(int cmd, void *arg) return _hypercall2(int, event_channel_op, cmd, arg); } -static inline int +static __always_inline int HYPERVISOR_xen_version(int cmd, void *arg) { return _hypercall2(int, xen_version, cmd, arg); diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 3506d8c598c1..4557f7cb0fa6 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -14,16 +14,19 @@ static inline int pci_xen_hvm_init(void) return -1; } #endif -#if defined(CONFIG_XEN_DOM0) +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void); -int xen_find_device_domain_owner(struct pci_dev *dev); -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); -int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline int __init pci_xen_initial_domain(void) { return -1; } +#endif +#ifdef CONFIG_XEN_DOM0 +int xen_find_device_domain_owner(struct pci_dev *dev); +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); +int xen_unregister_device_domain_owner(struct pci_dev *dev); +#else static inline int xen_find_device_domain_owner(struct pci_dev *dev) { return -1; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e9da3dc71254..23fb4d51a5da 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -29,6 +29,7 @@ #include <asm/io.h> #include <asm/fixmap.h> #include <asm/paravirt.h> +#include <asm/asm-prototypes.h> int __read_mostly alternatives_patched; @@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { struct insn insn; int i = 0; @@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins * optimized. 
*/ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, a->instrlen, i); + i += optimize_nops_range(instr, len, i); else i += insn.length; - if (i >= a->instrlen) + if (i >= len) return; } } @@ -331,10 +333,185 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); next: - optimize_nops(a, instr); + optimize_nops(instr, a->instrlen); } } +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,amd when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. 
+ */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ + #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) @@ -643,6 +820,12 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + + /* * Then patch alternatives, such that those paravirt calls that are in * alternatives can be overwritten by their immediate fragments. */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecfca3bbcd96..ba43597f1027 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. 
Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1c7897c33327..8be4b08fd299 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -329,6 +329,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_SMAP cr4_set_bits(X86_CR4_SMAP); #else + clear_cpu_cap(c, X86_FEATURE_SMAP); cr4_clear_bits(X86_CR4_SMAP); #endif } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4b8813bafffd..bb1c3f5f60c8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + kfree(hw_dom); return; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 38837dad46e6..391a4e2b8604 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -714,12 +714,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 445c57c9c539..831b25c5e705 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -379,9 +379,14 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, sizeof(fpu->state.fxsave))) return -EFAULT; - /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). 
*/ + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 42fc41dd0e1f..882213df3713 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -10,6 +10,7 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/mwait.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -916,6 +917,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. 
*/ @@ -929,6 +1007,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 044902d5a3c4..e5dd6da78713 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -148,6 +149,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 8ef35063964b..760e1f293093 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,9 +7,11 @@ /* * unsigned long native_save_fl(void) */ +.pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret SYM_FUNC_END(native_save_fl) +.popsection EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5e9a34b5bd74..169fb6f4cd2e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -251,7 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -267,8 +268,14 @@ int module_finalize(const Elf_Ehdr *hdr, orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 04cafc057bed..ebc45360ffd4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -218,6 +218,36 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } + +static noinstr unsigned long pv_native_read_cr2(void) +{ + return native_read_cr2(); +} + +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} + +static noinstr void pv_native_irq_enable(void) +{ + native_irq_enable(); +} + +static noinstr void pv_native_irq_disable(void) +{ + native_irq_disable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -244,8 +274,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, - .cpu.get_debugreg = native_get_debugreg, - .cpu.set_debugreg = native_set_debugreg, + .cpu.get_debugreg = pv_native_get_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, @@ -281,8 +311,8 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. 
*/ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ @@ -298,8 +328,8 @@ struct paravirt_patch_template pv_ops = { .mmu.exit_mmap = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), - .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), + .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, @@ -371,9 +401,6 @@ struct paravirt_patch_template pv_ops = { }; #ifdef CONFIG_PARAVIRT_XXL -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); -NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); void (*paravirt_iret)(void) = native_iret; diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 9f90f460a28c..ff1e82ff52d9 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { ghcb->save.sw_exit_code = 0; - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } static bool vc_decoding_needed(unsigned long exit_code) @@ -130,6 +130,8 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, } else { ret = ES_VMM_ERROR; } + } else if (ghcb->save.sw_exit_info_1 & 0xffffffff) { + ret = ES_VMM_ERROR; } else { ret = ES_OK; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index efd9e9ea17f2..3d6dc12d198f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -272,6 +272,20 @@ SECTIONS __parainstructions_end = .; } +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } +#endif + /* * struct alt_inst entries. 
From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 76fb00921203..d6ac32f3f650 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2321,13 +2321,14 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; + u64 msr_val; int i; if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; + msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + msr_val |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, msr_val); } if (!apic) @@ -2336,11 +2337,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) { - apic->base_address = APIC_DEFAULT_PHYS_BASE; - + /* The xAPIC ID is set at RESET even if the APIC was already enabled. */ + if (!init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -2481,6 +2480,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } + /* + * Stuff the APIC ENABLE bit in lieu of temporarily incrementing + * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset(). + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -2942,5 +2946,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); + WARN_ON(static_branch_unlikely(&apic_hw_disabled.key)); static_key_deferred_flush(&apic_sw_disabled); + WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1a64ba5b9437..0cc58901bf7a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4596,10 +4596,10 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) unsigned bit; bool wp; - if (!is_cr4_pke(mmu)) { - mmu->pkru_mask = 0; + mmu->pkru_mask = 0; + + if (!is_cr4_pke(mmu)) return; - } wp = is_cr0_wp(mmu); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c36b5fe4c27c..7e34d7163ada 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -618,7 +618,12 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; vmsa.address = __sme_pa(svm->vmsa); vmsa.len = PAGE_SIZE; - return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + if (ret) + return ret; + + vcpu->arch.guest_state_protected = true; + return 0; } static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) @@ -1479,6 +1484,13 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans; } + /* + * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP + * encrypts the written data with the guest's key, and the cache may + * contain dirty, unencrypted data. + */ + sev_clflush_pages(guest_page, n); + /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. 
*/ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; @@ -2579,11 +2591,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) { - if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2)) + int count; + int bytes; + + if (svm->vmcb->control.exit_info_2 > INT_MAX) + return -EINVAL; + + count = svm->vmcb->control.exit_info_2; + if (unlikely(check_mul_overflow(count, size, &bytes))) + return -EINVAL; + + if (!setup_vmgexit_scratch(svm, in, bytes)) return -EINVAL; - return kvm_sev_es_string_io(&svm->vcpu, size, port, - svm->ghcb_sa, svm->ghcb_sa_len, in); + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in); } void sev_es_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 128a54b1fbf1..e63ac08115cf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -191,7 +191,7 @@ struct vcpu_svm { /* SEV-ES scratch area support */ void *ghcb_sa; - u64 ghcb_sa_len; + u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; @@ -218,12 +218,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); void recalc_intercepts(struct vcpu_svm *svm); -static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) { return container_of(kvm, struct kvm_svm, kvm); } -static inline bool sev_guest(struct kvm *kvm) +static __always_inline bool sev_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -234,7 +234,7 @@ static inline bool sev_guest(struct kvm *kvm) #endif } -static inline bool sev_es_guest(struct kvm *kvm) +static __always_inline bool sev_es_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -271,7 +271,7 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 22e2b019de37..9430d6437c9f 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid) * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, * hence 'unsigned long' instead of 'hpa_t'. 
*/ -static inline void vmsave(unsigned long pa) +static __always_inline void vmsave(unsigned long pa) { svm_asm1(vmsave, "a" (pa), "memory"); } -static inline void vmload(unsigned long pa) +static __always_inline void vmload(unsigned long pa) { svm_asm1(vmload, "a" (pa), "memory"); } diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 152ab0aa82cf..16731d2cf231 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -static inline void evmcs_write64(unsigned long field, u64 value) +static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} static inline void evmcs_write16(unsigned long field, u16 value) {} static inline u64 evmcs_read64(unsigned long field) { return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 116b08904ac3..7d595effb66f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5562,9 +5562,13 @@ static int handle_encls(struct kvm_vcpu *vcpu) static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { - vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; - vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; - return 0; + /* + * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK + * VM-Exits. Unconditionally set the flag here and leave the handling to + * vmx_handle_exit(). + */ + to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + return 1; } /* @@ -6051,9 +6055,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) int ret = __vmx_handle_exit(vcpu, exit_fastpath); /* - * Even when current exit reason is handled by KVM internally, we - * still need to exit to user space when bus lock detected to inform - * that there is a bus lock in guest. + * Exit to user space when bus lock detected to inform that there is + * a bus lock in guest. */ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { if (ret > 0) @@ -6302,18 +6305,13 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) /* * If we are running L2 and L1 has a new pending interrupt - * which can be injected, we should re-evaluate - * what should be done with this new L1 interrupt. - * If L1 intercepts external-interrupts, we should - * exit from L2 to L1. Otherwise, interrupt should be - * delivered directly to L2. + * which can be injected, this may cause a vmexit or it may + * be injected into L2. Either way, this interrupt will be + * processed via KVM_REQ_EVENT, not RVI, because we do not use + * virtual interrupt delivery to inject L1 interrupts into L2. 
*/ - if (is_guest_mode(vcpu) && max_irr_updated) { - if (nested_exit_on_intr(vcpu)) - kvm_vcpu_exiting_guest_mode(vcpu); - else - kvm_make_request(KVM_REQ_EVENT, vcpu); - } + if (is_guest_mode(vcpu) && max_irr_updated) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aabd3a2ec1bc..bfe0de3008a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2542,7 +2542,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2550,7 +2550,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2780,9 +2780,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2800,15 +2800,15 @@ u64 get_kvmclock_ns(struct kvm *kvm) unsigned long flags; u64 ret; - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2902,13 +2902,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -6100,13 +6100,13 @@ set_pit2_out: * is slightly ahead) here we risk going negative on unsigned * 'system_time' when 'user_ns.clock' is very small. 
*/ - spin_lock_irq(&ka->pvclock_gtod_sync_lock); + raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock); if (kvm->arch.use_master_clock) now_ns = ka->master_kernel_ns; else now_ns = get_kvmclock_base_ns(); ka->kvmclock_offset = user_ns.clock - now_ns; - spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock); kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; @@ -6906,7 +6906,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, + unsigned short port, unsigned int count, bool in) { vcpu->arch.pio.port = port; @@ -6914,10 +6914,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) return 1; - } vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -6929,26 +6927,39 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, unsigned int count) { - int ret; + WARN_ON(vcpu->arch.pio.count); + memset(vcpu->arch.pio_data, 0, size * count); + return emulator_pio_in_out(vcpu, size, port, count, true); +} - if (vcpu->arch.pio.count) - goto data_avail; +static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) +{ + int size = vcpu->arch.pio.size; + unsigned count = vcpu->arch.pio.count; + memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); + vcpu->arch.pio.count = 0; +} - memset(vcpu->arch.pio_data, 0, size * count); +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) +{ + if (vcpu->arch.pio.count) { + /* Complete previous iteration. */ + } else { + int r = __emulator_pio_in(vcpu, size, port, count); + if (!r) + return r; - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); - vcpu->arch.pio.count = 0; - return 1; + /* Results already available, fall through. 
*/ } - return 0; + WARN_ON(count != vcpu->arch.pio.count); + complete_emulator_pio_in(vcpu, val); + return 1; } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -6963,9 +6974,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { + int ret; + memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); + ret = emulator_pio_in_out(vcpu, size, port, count, false); + if (ret) + vcpu->arch.pio.count = 0; + + return ret; } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -8139,9 +8156,9 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -8783,9 +8800,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); + + /* + * The call to kvm_ready_for_interrupt_injection() may end up in + * kvm_xen_has_interrupt() which may require the srcu lock to be + * held, to protect against changes in the vcpu_info address. + */ + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; @@ -9643,14 +9668,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (unlikely(kvm_vcpu_exit_request(vcpu))) { + if (vcpu->arch.apicv_active) + static_call(kvm_x86_sync_pir_to_irr)(vcpu); + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; break; } - - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); - } + } /* * Do this here before restoring debug registers on the host. 
And @@ -11182,7 +11207,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm); @@ -11392,7 +11417,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, int level = i + 1; int lpages = __kvm_mmu_slot_lpages(slot, npages, level); - WARN_ON(slot->arch.rmap[i]); + if (slot->arch.rmap[i]) + continue; slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { @@ -12367,44 +12393,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); -static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu) { - memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, - vcpu->arch.pio.count * vcpu->arch.pio.size); - vcpu->arch.pio.count = 0; + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + vcpu->arch.pio.count = 0; + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_outs(vcpu, size, port); return 1; } static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port) { - int ret; - - ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) - return ret; + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); + + /* memcpy done already by emulator_pio_out. */ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + if (!ret) + break; - vcpu->arch.pio.count = 0; + /* Emulation done by the kernel. */ + if (!vcpu->arch.sev_pio_count) + return 1; + } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs; return 0; } static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port); + +static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { - int ret; + unsigned count = vcpu->arch.pio.count; + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; +} - ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) { - vcpu->arch.pio.count = 0; - } else { - vcpu->arch.guest_ins_data = data; - vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + + advance_sev_es_emulated_ins(vcpu); + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_ins(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + if (!__emulator_pio_in(vcpu, size, port, count)) + break; + + /* Emulation done by the kernel. 
*/ + advance_sev_es_emulated_ins(vcpu); + if (!vcpu->arch.sev_pio_count) + return 1; } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; return 0; } @@ -12412,8 +12475,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in) { - return in ? kvm_sev_es_ins(vcpu, size, port, data, count) - : kvm_sev_es_outs(vcpu, size, port, data, count); + vcpu->arch.sev_pio_data = data; + vcpu->arch.sev_pio_count = count; + return in ? kvm_sev_es_ins(vcpu, size, port) + : kvm_sev_es_outs(vcpu, size, port); } EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9ea9c3dabe37..8f62baebd028 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -190,6 +190,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + int err; u8 rc = 0; /* @@ -216,13 +217,29 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (likely(slots->generation == ghc->generation && !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { /* Fast path */ - __get_user(rc, (u8 __user *)ghc->hva + offset); - } else { - /* Slow path */ - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + pagefault_disable(); + err = __get_user(rc, (u8 __user *)ghc->hva + offset); + pagefault_enable(); + if (!err) + return rc; } + /* Slow path */ + + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + */ + if (in_atomic() || !task_is_running(current)) + return 1; + + kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, + sizeof(rc)); + return rc; } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ec9922cba30a..cf0b39f97adc 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -28,46 +28,14 @@ .macro THUNK reg - .align 32 - -SYM_FUNC_START(__x86_indirect_thunk_\reg) + .align RETPOLINE_THUNK_SIZE +SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD -SYM_FUNC_END(__x86_indirect_thunk_\reg) - -.endm - -/* - * This generates .altinstr_replacement symbols for use by objtool. They, - * however, must not actually live in .altinstr_replacement since that will be - * discarded after init, but module alternatives will also reference these - * symbols. - * - * Their names matches the "__x86_indirect_" prefix to mark them as retpolines. 
- */ -.macro ALT_THUNK reg - - .align 1 - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) - ANNOTATE_RETPOLINE_SAFE -1: call *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_call_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg) - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) - ANNOTATE_RETPOLINE_SAFE -1: jmp *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) - .endm /* @@ -85,22 +53,16 @@ STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_thunk_array) + #define GEN(reg) THUNK reg #include <asm/GEN-for-each-reg.h> - #undef GEN -#define GEN(reg) EXPORT_THUNK(reg) -#include <asm/GEN-for-each-reg.h> -#undef GEN -#define GEN(reg) ALT_THUNK reg -#include <asm/GEN-for-each-reg.h> + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_thunk_array) -#undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg) +#define GEN(reg) EXPORT_THUNK(reg) #include <asm/GEN-for-each-reg.h> - #undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg) -#include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9ea57389c554..39c802525fce 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,7 +15,6 @@ #include <asm/set_memory.h> #include <asm/nospec-branch.h> #include <asm/text-patching.h> -#include <asm/asm-prototypes.h> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -225,6 +224,14 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* Epilogue code offset */ + + /* + * Program specific offsets of labels in the code; these rely on the + * JIT doing at least 2 passes, recording the position on the first + * pass, only to generate the correct offset on the second pass. 
+ */ + int tail_call_direct_label; + int tail_call_indirect_label; }; /* Maximum number of bytes emitted while JITing one eBPF insn */ @@ -380,20 +387,23 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } -static int get_pop_bytes(bool *callee_regs_used) +#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) + +static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { - int bytes = 0; + u8 *prog = *pprog; - if (callee_regs_used[3]) - bytes += 2; - if (callee_regs_used[2]) - bytes += 2; - if (callee_regs_used[1]) - bytes += 2; - if (callee_regs_used[0]) - bytes += 1; +#ifdef CONFIG_RETPOLINE + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + EMIT_LFENCE(); + EMIT2(0xFF, 0xE0 + reg); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + } else +#endif + EMIT2(0xFF, 0xE0 + reg); - return bytes; + *pprog = prog; } /* @@ -411,29 +421,12 @@ static int get_pop_bytes(bool *callee_regs_used) * out: */ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, - u32 stack_depth) + u32 stack_depth, u8 *ip, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 42; - int off2 = 31; - int off3 = 9; - - /* count the additional bytes used for popping callee regs from stack - * that need to be taken into account for each of the offsets that - * are used for bailing out of the tail call - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - off2 += pop_bytes; - off3 += pop_bytes; - - if (stack_depth) { - off1 += 7; - off2 += 7; - off3 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * rdi - pointer to ctx @@ -448,8 +441,9 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ - EMIT2(X86_JBE, OFFSET1); /* jbe out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JBE, offset); /* jbe out */ /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) @@ -457,8 +451,9 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JA, OFFSET2); /* ja out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JA, offset); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ @@ -471,12 +466,11 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * goto out; */ EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ -#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JE, OFFSET3); /* je out */ - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JE, offset); /* je out */ + + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) @@ -493,41 +487,21 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * rdi == ctx (1st arg) * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - 
RETPOLINE_RCX_BPF_JIT(); + emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start)); /* out: */ + ctx->tail_call_indirect_label = prog - start; *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image, - bool *callee_regs_used, u32 stack_depth) + u8 **pprog, u8 *ip, + bool *callee_regs_used, u32 stack_depth, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 20; - int poke_off; - - /* count the additional bytes used for popping callee regs to stack - * that need to be taken into account for jump offset that is used for - * bailing out from of the tail call when limit is reached - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - - /* - * total bytes for: - * - nop5/ jmpq $off - * - pop callee regs - * - sub rsp, $val if depth > 0 - * - pop rax - */ - poke_off = X86_PATCH_SIZE + pop_bytes + 1; - if (stack_depth) { - poke_off += 7; - off1 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) @@ -535,28 +509,30 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, off1); /* ja out */ + + offset = ctx->tail_call_direct_label - (prog + 2 - start); + EMIT2(X86_JA, offset); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ - poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->tailcall_bypass = ip + (prog - start); poke->adj_off = X86_TAIL_CALL_OFFSET; - poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE; poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; + /* out: */ + ctx->tail_call_direct_label = prog - start; *pprog = prog; } @@ -1222,8 +1198,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* speculation barrier */ case BPF_ST | BPF_NOSPEC: if (boot_cpu_has(X86_FEATURE_XMM2)) - /* Emit 'lfence' */ - EMIT3(0x0F, 0xAE, 0xE8); + EMIT_LFENCE(); break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -1412,13 +1387,16 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image, + &prog, image + addrs[i - 1], callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + ctx); else emit_bpf_tail_call_indirect(&prog, callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + image + addrs[i - 1], + ctx); break; /* cond jump */ @@ -2124,24 +2102,6 @@ cleanup: return ret; } -static int emit_fallback_jump(u8 **pprog) -{ - u8 *prog = *pprog; - int err = 0; - -#ifdef CONFIG_RETPOLINE - /* Note that this assumes the the compiler uses external - * thunks for indirect calls. Both clang and GCC use the same - * naming convention for external thunks. 
- */ - err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); -#else - EMIT2(0xFF, 0xE2); /* jmp rdx */ -#endif - *pprog = prog; - return err; -} - static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; @@ -2163,9 +2123,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) if (err) return err; - err = emit_fallback_jump(&prog); /* jmp thunk/indirect */ - if (err) - return err; + emit_indirect_jump(&prog, 2 /* rdx */, prog); *pprog = prog; return 0; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3bfda5f502cb..da9b7cfa4632 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -15,6 +15,7 @@ #include <asm/cacheflush.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> +#include <asm/asm-prototypes.h> #include <linux/bpf.h> /* @@ -1267,6 +1268,21 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) *pprog = prog; } +static int emit_jmp_edx(u8 **pprog, u8 *ip) +{ + u8 *prog = *pprog; + int cnt = 0; + +#ifdef CONFIG_RETPOLINE + EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5)); +#else + EMIT2(0xFF, 0xE2); +#endif + *pprog = prog; + + return cnt; +} + /* * Generate the following code: * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... @@ -1280,7 +1296,7 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call(u8 **pprog, u8 *ip) { u8 *prog = *pprog; int cnt = 0; @@ -1362,7 +1378,7 @@ static void emit_bpf_tail_call(u8 **pprog) * eax == ctx (1st arg) * edx == prog->bpf_func + prologue_size */ - RETPOLINE_EDX_BPF_JIT(); + cnt += emit_jmp_edx(&prog, ip + cnt); if (jmp_label1 == -1) jmp_label1 = cnt; @@ -2122,7 +2138,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; } case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + emit_bpf_tail_call(&prog, image + addrs[i - 1]); break; /* cond jump */ diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 3d41a09c2c14..5debe4ac6f81 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -113,7 +113,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, false /* no mapping of GSI to PIRQ */); } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static int xen_register_gsi(u32 gsi, int triggering, int polarity) { int rc, irq; @@ -261,7 +261,7 @@ error: return irq; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static bool __read_mostly pci_seg_supported = true; static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) @@ -375,10 +375,10 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev) WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret); } } -#else /* CONFIG_XEN_DOM0 */ +#else /* CONFIG_XEN_PV_DOM0 */ #define xen_initdom_setup_msi_irqs NULL #define xen_initdom_restore_msi_irqs NULL -#endif /* !CONFIG_XEN_DOM0 */ +#endif /* !CONFIG_XEN_PV_DOM0 */ static void xen_teardown_msi_irqs(struct pci_dev *dev) { @@ -555,7 +555,7 @@ int __init pci_xen_hvm_init(void) return 0; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void) { int irq; @@ -583,6 +583,9 @@ int __init pci_xen_initial_domain(void) } return 0; } +#endif + +#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; @@ -656,4 +659,4 @@ int xen_unregister_device_domain_owner(struct pci_dev *dev) return 0; } 
EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); -#endif +#endif /* CONFIG_XEN_DOM0 */ diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index ee2beda590d0..1d4a00e767ec 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = { static struct olpc_ec_driver ec_xo1_5_driver = { .ec_cmd = olpc_xo1_ec_cmd, -#ifdef CONFIG_OLPC_XO1_5_SCI +#ifdef CONFIG_OLPC_XO15_SCI /* * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is * compiled in diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 9ac7457f52a3..ed0442e35434 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -16,15 +16,15 @@ /* * PVH variables. * - * pvh_bootparams and pvh_start_info need to live in the data segment since + * pvh_bootparams and pvh_start_info need to live in a data segment since * they are used after startup_{32|64}, which clear .bss, are invoked. */ -struct boot_params pvh_bootparams __section(".data"); -struct hvm_start_info pvh_start_info __section(".data"); +struct boot_params __initdata pvh_bootparams; +struct hvm_start_info __initdata pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +const unsigned int __initconst pvh_start_info_sz = sizeof(pvh_start_info); -static u64 pvh_get_root_pointer(void) +static u64 __init pvh_get_root_pointer(void) { return pvh_start_info.rsdp_paddr; } @@ -107,7 +107,7 @@ void __init __weak xen_pvh_init(struct boot_params *boot_params) BUG(); } -static void hypervisor_specific_init(bool xen_guest) +static void __init hypervisor_specific_init(bool xen_guest) { if (xen_guest) xen_pvh_init(&pvh_bootparams); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index afc1da68b06d..6bcd3d8ca6ac 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -43,13 +43,9 @@ config XEN_PV_SMP def_bool y depends on XEN_PV && SMP -config XEN_DOM0 - bool "Xen PV Dom0 support" - default y - depends on XEN_PV && PCI_XEN && SWIOTLB_XEN - depends on X86_IO_APIC && ACPI && PCI - help - Support running as a Xen PV Dom0 guest. +config XEN_PV_DOM0 + def_bool y + depends on XEN_PV && XEN_DOM0 config XEN_PVHVM def_bool y @@ -86,3 +82,12 @@ config XEN_PVH def_bool n help Support for running as a Xen PVH guest. + +config XEN_DOM0 + bool "Xen Dom0 support" + default XEN_PV + depends on (XEN_PV && SWIOTLB_XEN) || (XEN_PVH && X86_64) + depends on X86_IO_APIC && ACPI && PCI + select X86_X2APIC if XEN_PVH && X86_64 + help + Support running as a Xen Dom0 guest. 
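The Kconfig rework above turns XEN_DOM0 into a generic "Xen Dom0 support" option (PV or PVH) and adds XEN_PV_DOM0 as a derived symbol for the PV-only pieces, which is why the arch/x86/pci/xen.c and asm/xen/pci.h hunks split their #ifdef blocks in two. Below is a minimal illustrative sketch, not part of the patch series, of how a caller might combine the new compile-time split with the existing runtime helpers: example_dom0_setup() is a hypothetical function invented for the example, while xen_initial_domain(), xen_pv_domain() and pci_xen_initial_domain() are the helpers referenced by the hunks above.

/* Illustrative sketch only -- not taken from the patches above. */
#include <xen/xen.h>		/* xen_initial_domain(), xen_pv_domain() */
#include <asm/xen/pci.h>	/* pci_xen_initial_domain() (real or inline stub) */

static int __init example_dom0_setup(void)
{
	/* Work common to any Dom0 (PV or PVH) keys off the runtime check. */
	if (!xen_initial_domain())
		return 0;

	/*
	 * PV-only initial-domain setup is now guarded by CONFIG_XEN_PV_DOM0,
	 * so a PVH Dom0 kernel compiles it out; the !CONFIG_XEN_PV_DOM0 stub
	 * from asm/xen/pci.h keeps this buildable either way.
	 */
	if (IS_ENABLED(CONFIG_XEN_PV_DOM0) && xen_pv_domain())
		return pci_xen_initial_domain();

	return 0;
}
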
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b5779fce21..4953260e281c 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += vga.o +obj-$(CONFIG_XEN_PV_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c79bd0af2e8c..95d970359e17 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG #include <linux/memblock.h> #endif +#include <linux/console.h> #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/slab.h> @@ -10,12 +11,15 @@ #include <xen/xen.h> #include <xen/features.h> +#include <xen/interface/sched.h> +#include <xen/interface/version.h> #include <xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> #include <asm/cpu.h> #include <asm/e820/api.h> +#include <asm/setup.h> #include "xen-ops.h" #include "smp.h" @@ -52,9 +56,6 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); -enum xen_domain_type xen_domain_type = XEN_NATIVE; -EXPORT_SYMBOL_GPL(xen_domain_type); - unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); unsigned long machine_to_phys_nr; @@ -69,10 +70,12 @@ __read_mostly int xen_have_vector_callback; EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* - * NB: needs to live in .data because it's used by xen_prepare_pvh which runs - * before clearing the bss. + * NB: These need to live in .data or alike because they're used by + * xen_prepare_pvh() which runs before clearing the bss. */ -uint32_t xen_start_flags __section(".data") = 0; +enum xen_domain_type __ro_after_init xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); +uint32_t __ro_after_init xen_start_flags; EXPORT_SYMBOL(xen_start_flags); /* @@ -258,6 +261,45 @@ int xen_vcpu_setup(int cpu) return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } +void __init xen_banner(void) +{ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + + pr_info("Booting kernel on %s\n", pv_info.name); + pr_info("Xen version: %u.%u%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) + ? 
" (preserve-AD)" : ""); +} + +/* Check if running on Xen version (major, minor) or later */ +bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} + +void __init xen_add_preferred_consoles(void) +{ + add_preferred_console("xenboot", 0, NULL); + if (!boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); +} + void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 6e0d0754f94f..4f63117f09bb 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -28,7 +28,6 @@ #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/highmem.h> -#include <linux/console.h> #include <linux/pci.h> #include <linux/gfp.h> #include <linux/edd.h> @@ -109,17 +108,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel on %s\n", pv_info.name); - pr_info("Xen version: %d.%d%s (preserve-AD)\n", - version >> 16, version & 0xffff, extra.extraversion); -} - static void __init xen_pv_init_platform(void) { populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -142,22 +130,6 @@ static void __init xen_pv_guest_late_init(void) #endif } -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - static __read_mostly unsigned int cpuid_leaf5_ecx_val; static __read_mostly unsigned int cpuid_leaf5_edx_val; @@ -311,12 +283,12 @@ static void __init xen_init_capabilities(void) } } -static void xen_set_debugreg(int reg, unsigned long val) +static noinstr void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); } -static unsigned long xen_get_debugreg(int reg) +static noinstr unsigned long xen_get_debugreg(int reg) { return HYPERVISOR_get_debugreg(reg); } @@ -1053,52 +1025,54 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, +static const typeof(pv_ops) xen_cpu_ops __initconst = { + .cpu = { + .cpuid = xen_cpuid, - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, - .write_cr4 = xen_write_cr4, + .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, + .wbinvd = native_wbinvd, - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, - .read_msr_safe = xen_read_msr_safe, - 
.write_msr_safe = xen_write_msr_safe, + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, - .read_pmc = xen_read_pmc, + .read_pmc = xen_read_pmc, - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, - .load_gs_index = xen_load_gs_index, + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + .load_gs_index = xen_load_gs_index, - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, - .store_tr = xen_store_tr, + .store_tr = xen_store_tr, - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_sp0 = xen_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM - .invalidate_io_bitmap = xen_invalidate_io_bitmap, - .update_io_bitmap = xen_update_io_bitmap, + .invalidate_io_bitmap = xen_invalidate_io_bitmap, + .update_io_bitmap = xen_update_io_bitmap, #endif - .io_delay = xen_io_delay, + .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, - .end_context_switch = xen_end_context_switch, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, + }, }; static void xen_restart(char *msg) @@ -1239,7 +1213,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.cpu = xen_cpu_ops; + pv_ops.cpu = xen_cpu_ops.cpu; paravirt_iret = xen_iret; xen_init_irq_ops(); @@ -1364,7 +1338,6 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { - add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; x86_platform.set_legacy_features = @@ -1409,11 +1382,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } - if (!boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); - if (boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); + xen_add_preferred_consoles(); #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 0d5e34b9e6f9..bcae606bbc5c 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/export.h> #include <xen/hvc-console.h> @@ -18,10 +19,11 @@ /* * PVH variables. * - * The variable xen_pvh needs to live in the data segment since it is used + * The variable xen_pvh needs to live in a data segment since it is used * after startup_{32|64} is invoked, which will clear the .bss segment. 
*/ -bool xen_pvh __section(".data") = 0; +bool __ro_after_init xen_pvh; +EXPORT_SYMBOL_GPL(xen_pvh); void __init xen_pvh_init(struct boot_params *boot_params) { @@ -36,6 +38,10 @@ void __init xen_pvh_init(struct boot_params *boot_params) pfn = __pa(hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + if (xen_initial_domain()) + x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.banner = xen_banner; + xen_efi_init(boot_params); } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index dfa091d79c2e..4fe387e520af 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -19,12 +19,12 @@ * callback mask. We do this in a very simple manner, by making a call * down into Xen. The pending flag will be checked by Xen on return. */ -void xen_force_evtchn_callback(void) +noinstr void xen_force_evtchn_callback(void) { (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage __visible unsigned long xen_save_fl(void) +asmlinkage __visible noinstr unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -40,9 +40,9 @@ asmlinkage __visible unsigned long xen_save_fl(void) */ return (-flags) & X86_EFLAGS_IF; } -PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); +__PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl, ".noinstr.text"); -asmlinkage __visible void xen_irq_disable(void) +asmlinkage __visible noinstr void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -51,9 +51,9 @@ asmlinkage __visible void xen_irq_disable(void) this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); +__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable, ".noinstr.text"); -asmlinkage __visible void xen_irq_enable(void) +asmlinkage __visible noinstr void xen_irq_enable(void) { struct vcpu_info *vcpu; @@ -76,7 +76,7 @@ asmlinkage __visible void xen_irq_enable(void) preempt_enable(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); +__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable, ".noinstr.text"); static void xen_safe_halt(void) { @@ -94,17 +94,20 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initconst = { - .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), - .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), +static const typeof(pv_ops) xen_irq_ops __initconst = { + .irq = { - .safe_halt = xen_safe_halt, - .halt = xen_halt, + .save_fl = PV_CALLEE_SAVE(xen_save_fl), + .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), + .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), + + .safe_halt = xen_safe_halt, + .halt = xen_halt, + }, }; void __init xen_init_irq_ops(void) { - pv_ops.irq = xen_irq_ops; + pv_ops.irq = xen_irq_ops.irq; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 8d751939c6f3..1ce436eeda15 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1204,7 +1204,8 @@ static void __init xen_pagetable_init(void) xen_remap_memory(); xen_setup_mfn_list_list(); } -static void xen_write_cr2(unsigned long cr2) + +static noinstr void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } @@ -2078,67 +2079,69 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), - .write_cr2 = xen_write_cr2, +static const typeof(pv_ops) xen_mmu_ops __initconst = { + .mmu = { + .read_cr2 = 
__PV_IS_CALLEE_SAVE(xen_read_cr2), + .write_cr2 = xen_write_cr2, - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_one_user = xen_flush_tlb_one_user, - .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, + .tlb_remove_table = tlb_remove_table, - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, - .set_pte = xen_set_pte_init, - .set_pmd = xen_set_pmd_hyper, + .set_pte = xen_set_pte_init, + .set_pmd = xen_set_pmd_hyper, - .ptep_modify_prot_start = xen_ptep_modify_prot_start, - .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, + .ptep_modify_prot_start = xen_ptep_modify_prot_start, + .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - .set_pud = xen_set_pud_hyper, + .set_pud = xen_set_pud_hyper, - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, #if CONFIG_PGTABLE_LEVELS >= 5 - .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), - .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, - }, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, - .set_fixmap = xen_set_fixmap, + .set_fixmap = xen_set_fixmap, + }, }; void __init xen_init_mmu_ops(void) @@ -2146,7 +2149,7 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.hyper.init_after_bootmem = xen_after_bootmem; - pv_ops.mmu = xen_mmu_ops; + pv_ops.mmu = xen_mmu_ops.mmu; memset(dummy_mapping, 0xff, PAGE_SIZE); } @@ -2398,7 +2401,7 @@ static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int 
*err_ptr, pgprot_t prot, - unsigned int domid, bool no_translate, struct page **pages) + unsigned int domid, bool no_translate) { int err = 0; struct remap_data rmd; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1e626444712b..220dd9678494 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -21,6 +21,45 @@ #include <linux/init.h> #include <linux/linkage.h> +.pushsection .noinstr.text, "ax" +/* + * Disabling events is simply a matter of making the event mask + * non-zero. + */ +SYM_FUNC_START(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + ret +SYM_FUNC_END(xen_irq_disable_direct) + +/* + * Force an event check by making a hypercall, but preserve regs + * before making the call. + */ +SYM_FUNC_START(check_events) + FRAME_BEGIN + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call xen_force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + FRAME_END + ret +SYM_FUNC_END(check_events) + /* * Enable events. This clears the event mask and tests the pending * event status with one and operation. If there are pending events, @@ -47,16 +86,6 @@ SYM_FUNC_START(xen_irq_enable_direct) ret SYM_FUNC_END(xen_irq_enable_direct) - -/* - * Disabling events is simply a matter of making the event mask - * non-zero. - */ -SYM_FUNC_START(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - ret -SYM_FUNC_END(xen_irq_disable_direct) - /* * (xen_)save_fl is used to get the current interrupt enable status. * Callers expect the status to be in X86_EFLAGS_IF, and other bits @@ -73,35 +102,6 @@ SYM_FUNC_START(xen_save_fl_direct) ret SYM_FUNC_END(xen_save_fl_direct) -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. 
- */ -SYM_FUNC_START(check_events) - FRAME_BEGIN - push %rax - push %rcx - push %rdx - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - call xen_force_evtchn_callback - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rdx - pop %rcx - pop %rax - FRAME_END - ret -SYM_FUNC_END(check_events) - SYM_FUNC_START(xen_read_cr2) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX @@ -116,6 +116,7 @@ SYM_FUNC_START(xen_read_cr2_direct) FRAME_END ret SYM_FUNC_END(xen_read_cr2_direct); +.popsection .macro xen_pv_trap name SYM_CODE_START(xen_\name) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index cb6538ae2fe0..9e27b86a0c31 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -20,6 +20,23 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> +.pushsection .noinstr.text, "ax" + .balign PAGE_SIZE +SYM_CODE_START(hypercall_page) + .rept (PAGE_SIZE / 32) + UNWIND_HINT_FUNC + .skip 31, 0x90 + ret + .endr + +#define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 +#include <asm/xen-hypercalls.h> +#undef HYPERCALL +SYM_CODE_END(hypercall_page) +.popsection + #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -64,23 +81,6 @@ SYM_CODE_END(asm_cpu_bringup_and_idle) #endif #endif -.pushsection .text - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - .skip 31, 0x90 - ret - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include <asm/xen-hypercalls.h> -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8d7ec49a35fb..8bc8b72a205d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,6 +51,7 @@ void __init xen_remap_memory(void); phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); +void xen_banner(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); @@ -109,7 +110,7 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, @@ -118,6 +119,8 @@ static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, } #endif +void xen_add_preferred_consoles(void); + void __init xen_init_apic(void); #ifdef CONFIG_XEN_EFI diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index cf907e5bf2f2..a8a041609c5d 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -120,7 +120,8 @@ void flush_cache_page(struct vm_area_struct*, #define flush_cache_vunmap(start,end) flush_cache_all() #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page*); +void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); void local_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -137,7 +138,9 @@ void 
local_flush_cache_page(struct vm_area_struct *vma, #define flush_cache_vunmap(start,end) do { } while (0) #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO #define flush_dcache_page(page) do { } while (0) +static inline void flush_dcache_folio(struct folio *folio) { } #define flush_icache_range local_flush_icache_range #define flush_cache_page(vma, addr, pfn) do { } while (0) diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h index 7cbf68ca7106..6fc05cba61a2 100644 --- a/arch/xtensa/include/asm/kmem_layout.h +++ b/arch/xtensa/include/asm/kmem_layout.h @@ -78,7 +78,7 @@ #endif #define XCHAL_KIO_SIZE 0x10000000 -#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_OF) +#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_USE_OF) #define XCHAL_KIO_PADDR xtensa_get_kio_paddr() #ifndef __ASSEMBLY__ extern unsigned long xtensa_kio_paddr; diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index 764b54bef701..15051a8a1539 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -143,7 +143,7 @@ unsigned xtensa_get_ext_irq_no(unsigned irq) void __init init_IRQ(void) { -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF irqchip_init(); #else #ifdef CONFIG_HAVE_SMP diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index ed184106e4cf..ee9082a142fe 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -63,7 +63,7 @@ extern unsigned long initrd_end; extern int initrd_below_start_ok; #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF void *dtb_start = __dtb_start; #endif @@ -125,7 +125,7 @@ __tagtable(BP_TAG_INITRD, parse_tag_initrd); #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF static int __init parse_tag_fdt(const bp_tag_t *tag) { @@ -135,7 +135,7 @@ static int __init parse_tag_fdt(const bp_tag_t *tag) __tagtable(BP_TAG_FDT, parse_tag_fdt); -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ static int __init parse_tag_cmdline(const bp_tag_t* tag) { @@ -183,7 +183,7 @@ static int __init parse_bootparam(const bp_tag_t *tag) } #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF #if !XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY unsigned long xtensa_kio_paddr = XCHAL_KIO_DEFAULT_PADDR; @@ -232,7 +232,7 @@ void __init early_init_devtree(void *params) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); } -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ /* * Initialize architecture. 
(Early stage) @@ -253,7 +253,7 @@ void __init init_arch(bp_tag_t *bp_start) if (bp_start) parse_bootparam(bp_start); -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF early_init_devtree(dtb_start); #endif diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 7e4d97dc8bd8..38acda4f04e8 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -101,7 +101,7 @@ void init_mmu(void) void init_kio(void) { -#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_OF) +#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_USE_OF) /* * Update the IO area mapping in case xtensa_kio_paddr has changed */ diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 3cdfa00738e0..07b642c1916a 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -100,7 +100,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector, spin_unlock(&dev->lock); } -static blk_qc_t simdisk_submit_bio(struct bio *bio) +static void simdisk_submit_bio(struct bio *bio) { struct simdisk *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; @@ -118,7 +118,6 @@ static blk_qc_t simdisk_submit_bio(struct bio *bio) } bio_endio(bio); - return BLK_QC_T_NONE; } static int simdisk_open(struct block_device *bdev, fmode_t mode) @@ -259,6 +258,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which, struct proc_dir_entry *procdir) { char tmp[2] = { '0' + which, 0 }; + int err = -ENOMEM; dev->fd = -1; dev->filename = NULL; @@ -267,7 +267,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which, dev->gd = blk_alloc_disk(NUMA_NO_NODE); if (!dev->gd) - return -ENOMEM; + goto out; dev->gd->major = simdisk_major; dev->gd->first_minor = which; dev->gd->minors = SIMDISK_MINORS; @@ -275,10 +275,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which, dev->gd->private_data = dev; snprintf(dev->gd->disk_name, 32, "simdisk%d", which); set_capacity(dev->gd, 0); - add_disk(dev->gd); + err = add_disk(dev->gd); + if (err) + goto out_cleanup_disk; dev->procfile = proc_create_data(tmp, 0644, procdir, &simdisk_proc_ops, dev); + return 0; + +out_cleanup_disk: + blk_cleanup_disk(dev->gd); +out: + return err; } static int __init simdisk_init(void) diff --git a/arch/xtensa/platforms/xtfpga/setup.c b/arch/xtensa/platforms/xtfpga/setup.c index 4f7d6142d41f..538e6748e85a 100644 --- a/arch/xtensa/platforms/xtfpga/setup.c +++ b/arch/xtensa/platforms/xtfpga/setup.c @@ -51,8 +51,12 @@ void platform_power_off(void) void platform_restart(void) { - /* Flush and reset the mmu, simulate a processor reset, and - * jump to the reset vector. */ + /* Try software reset first. */ + WRITE_ONCE(*(u32 *)XTFPGA_SWRST_VADDR, 0xdead); + + /* If software reset did not work, flush and reset the mmu, + * simulate a processor reset, and jump to the reset vector. 
+ */ cpu_reset(); /* control never gets here */ } @@ -66,7 +70,7 @@ void __init platform_calibrate_ccount(void) #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF static void __init xtfpga_clk_setup(struct device_node *np) { @@ -284,4 +288,4 @@ static int __init xtavnet_init(void) */ arch_initcall(xtavnet_init); -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ diff --git a/block/Kconfig b/block/Kconfig index 8e28ae7718bd..c6ce41a5e5b2 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -73,7 +73,7 @@ config BLK_DEV_ZONED config BLK_DEV_THROTTLING bool "Block layer bio throttling support" - depends on BLK_CGROUP=y + depends on BLK_CGROUP select BLK_CGROUP_RWSTAT help Block layer bio throttling support. It can be used to limit @@ -112,7 +112,7 @@ config BLK_WBT_MQ config BLK_CGROUP_IOLATENCY bool "Enable support for latency based cgroup IO protection" - depends on BLK_CGROUP=y + depends on BLK_CGROUP help Enabling this option enables the .latency interface for IO throttling. The IO controller will attempt to maintain average IO latencies below @@ -132,7 +132,7 @@ config BLK_CGROUP_FC_APPID config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" - depends on BLK_CGROUP=y + depends on BLK_CGROUP select BLK_RQ_IO_DATA_LEN select BLK_RQ_ALLOC_TIME help @@ -190,39 +190,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK by falling back to the kernel crypto API when inline encryption hardware is not present. -menu "Partition Types" - source "block/partitions/Kconfig" -endmenu - -endif # BLOCK - config BLOCK_COMPAT - bool - depends on BLOCK && COMPAT - default y + def_bool COMPAT config BLK_MQ_PCI - bool - depends on BLOCK && PCI - default y + def_bool PCI config BLK_MQ_VIRTIO bool - depends on BLOCK && VIRTIO + depends on VIRTIO default y config BLK_MQ_RDMA bool - depends on BLOCK && INFINIBAND + depends on INFINIBAND default y config BLK_PM - def_bool BLOCK && PM + def_bool PM # do not use in new code config BLOCK_HOLDER_DEPRECATED bool source "block/Kconfig.iosched" + +endif # BLOCK diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a91..885fee86dfca 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -if BLOCK - menu "IO Schedulers" config MQ_IOSCHED_DEADLINE @@ -45,5 +43,3 @@ config BFQ_CGROUP_DEBUG files in a cgroup which can be useful for debugging. 
endmenu - -endif diff --git a/block/Makefile b/block/Makefile index 41aa1ba69c90..44df57e562bf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,13 +3,13 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ - disk-events.o + disk-events.o blk-ia-ranges.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o @@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bdev.c b/block/bdev.c index cf2780cb44a7..b4dab2fb6a74 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -12,6 +12,7 @@ #include <linux/major.h> #include <linux/device_cgroup.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> @@ -184,14 +185,13 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -int __sync_blockdev(struct block_device *bdev, int wait) +int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); + return filemap_flush(bdev->bd_inode->i_mapping); } +EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block @@ -199,7 +199,9 @@ int __sync_blockdev(struct block_device *bdev, int wait) */ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + if (!bdev) + return 0; + return filemap_write_and_wait(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL(sync_blockdev); @@ -326,12 +328,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_disk->queue, 0); + result = blk_queue_enter(bdev_get_queue(bdev), 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_READ); - blk_queue_exit(bdev->bd_disk->queue); + blk_queue_exit(bdev_get_queue(bdev)); return result; } @@ -362,7 +364,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_disk->queue, 0); + result = blk_queue_enter(bdev_get_queue(bdev), 0); if (result) return result; @@ -375,7 +377,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, clean_page_buffers(page); unlock_page(page); } - blk_queue_exit(bdev->bd_disk->queue); + blk_queue_exit(bdev_get_queue(bdev)); return result; } @@ -490,14 +492,15 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); - bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; 
+ bdev->bd_queue = disk->queue; bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); return NULL; } + bdev->bd_disk = disk; return bdev; } @@ -962,9 +965,11 @@ EXPORT_SYMBOL(blkdev_put); * @pathname: special file representing the block device * @dev: return value of the block device's dev_t * - * Get a reference to the blockdevice at @pathname in the current - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. + * Lookup the block device's dev_t at @pathname in the current + * namespace if possible and return it by @dev. + * + * RETURNS: + * 0 if succeeded, errno otherwise. */ int lookup_bdev(const char *pathname, dev_t *dev) { @@ -1016,7 +1021,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; @@ -1047,8 +1052,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); + if (!bdev->bd_openers) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. + */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index e2f14508f2d6..24a5c5329bcd 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -6,13 +6,13 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> -#include <linux/elevator.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> +#include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG @@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) @@ -666,6 +668,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(bfqq_group(bfqq)); + if (entity->parent && + entity->parent->last_bfqq_created == bfqq) + entity->parent->last_bfqq_created = NULL; + else if (bfqd->last_bfqq_created == bfqq) + bfqd->last_bfqq_created = NULL; + entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 480e1a134859..fec18118dc30 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -117,7 +117,6 @@ #include 
<linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> -#include <linux/elevator.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> @@ -127,6 +126,7 @@ #include <trace/events/block.h> +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -6884,8 +6884,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) struct blk_mq_tags *tags = hctx->sched_tags; unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); + min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6b47cddbbca1..d25114715459 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -6,7 +6,7 @@ * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/mempool.h> #include <linux/export.h> #include <linux/bio.h> @@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; diff --git a/block/bio.c b/block/bio.c index a6fb6a0b4295..15ab0d6d1c06 100644 --- a/block/bio.c +++ b/block/bio.c @@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size) snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, - ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; @@ -156,7 +157,7 @@ out: void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); + BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); @@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table, atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; @@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio); * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. 
*/ -void bio_truncate(struct bio *bio, unsigned new_size) +static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; @@ -677,7 +679,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs) void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { - BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } @@ -772,6 +774,23 @@ const char *bio_devname(struct bio *bio, char *buf) } EXPORT_SYMBOL(bio_devname); +/** + * bio_full - check if the bio is full + * @bio: bio to check + * @len: length of one segment to be added + * + * Return true if @bio is full and one segment with @len bytes can't be + * added to the bio, otherwise return false + */ +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + static inline bool page_is_mergeable(const struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) @@ -791,6 +810,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); } +/** + * __bio_try_merge_page - try appending data to an existing bvec. + * @bio: destination bio + * @page: start page to add + * @len: length of the data to add + * @off: offset of the data relative to @page + * @same_page: return if the segment has been merged inside the same page + * + * Try to add the data at @page + @off to the last bvec of @bio. This is a + * useful optimisation for file systems with a block size smaller than the + * page size. + * + * Warn if (@len, @off) crosses pages in case that @same_page is true. + * + * Return %true on success or %false on failure. + */ +static bool __bio_try_merge_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off, bool *same_page) +{ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return false; + + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page_is_mergeable(bv, page, len, off, same_page)) { + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; + } + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; + } + } + return false; +} + /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. This is not for normal read/write bios, but for passthrough @@ -908,7 +965,7 @@ EXPORT_SYMBOL(bio_add_pc_page); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); bool same_page = false; if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) @@ -923,45 +980,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page, EXPORT_SYMBOL_GPL(bio_add_zone_append_page); /** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. 
- * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. - */ -bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } - } - return false; -} -EXPORT_SYMBOL_GPL(__bio_try_merge_page); - -/** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add @@ -1015,52 +1033,40 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); -void bio_release_pages(struct bio *bio, bool mark_dirty) +void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct bvec_iter_all iter_all; struct bio_vec *bvec; - if (bio_flagged(bio, BIO_NO_PAGE_REF)) - return; - bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); put_page(bvec->bv_page); } } -EXPORT_SYMBOL_GPL(bio_release_pages); +EXPORT_SYMBOL_GPL(__bio_release_pages); -static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { + size_t size = iov_iter_count(iter); + WARN_ON_ONCE(bio->bi_max_vecs); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + size_t max_sectors = queue_max_zone_append_sectors(q); + + size = min(size, max_sectors << SECTOR_SHIFT); + } + bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = iter->count; + bio->bi_iter.bi_size = size; bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) -{ - __bio_iov_bvec_set(bio, iter); - iov_iter_advance(iter, iter->count); - return 0; -} - -static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) -{ - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - struct iov_iter i = *iter; - - iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); - __bio_iov_bvec_set(bio, &i); - iov_iter_advance(iter, i.count); - return 0; -} - static void bio_put_pages(struct page **pages, size_t size, size_t off) { size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); @@ -1130,7 +1136,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1202,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) int ret = 0; if (iov_iter_is_bvec(iter)) { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - return bio_iov_bvec_set_append(bio, iter); - return bio_iov_bvec_set(bio, iter); + bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, bio->bi_iter.bi_size); + return 0; } do { @@ -1260,18 
+1266,7 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); -/** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. - * - * @bio will then represent the remaining, uncompleted portion of the io. - */ -void bio_advance(struct bio *bio, unsigned bytes) +void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); @@ -1279,7 +1274,7 @@ void bio_advance(struct bio *bio, unsigned bytes) bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } -EXPORT_SYMBOL(bio_advance); +EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) @@ -1467,10 +1462,10 @@ again: return; if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); + rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 38b9f7684952..88b1fce90520 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -32,6 +32,7 @@ #include <linux/psi.h> #include "blk.h" #include "blk-ioprio.h" +#include "blk-throttle.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -620,7 +621,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) { struct block_device *bdev; struct request_queue *q; @@ -631,7 +632,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (IS_ERR(bdev)) return PTR_ERR(bdev); - q = bdev->bd_disk->queue; + q = bdev_get_queue(bdev); + + /* + * blkcg_deactivate_policy() requires queue to be frozen, we can grab + * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). + */ + ret = blk_queue_enter(q, 0); + if (ret) + return ret; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -702,6 +711,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: + blk_queue_exit(q); ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; @@ -714,6 +724,7 @@ fail_unlock: rcu_read_unlock(); fail: blkdev_put_no_open(bdev); + blk_queue_exit(q); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -736,9 +747,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). 
*/ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); + spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); rcu_read_unlock(); blkdev_put_no_open(ctx->bdev); } @@ -841,7 +852,7 @@ static void blkcg_fill_root_iostats(void) while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = - blk_queue_root_blkg(bdev->bd_disk->queue); + blk_queue_root_blkg(bdev_get_queue(bdev)); struct blkg_iostat tmp; int cpu; @@ -1800,7 +1811,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), - bio->bi_bdev->bd_disk->queue); + bdev_get_queue(bio->bi_bdev)); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1836,8 +1847,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; + blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); + bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); @@ -1897,10 +1908,11 @@ void blk_cgroup_bio_start(struct bio *bio) { int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; + unsigned long flags; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); - u64_stats_update_begin(&bis->sync); + flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split @@ -1912,7 +1924,7 @@ void blk_cgroup_bio_start(struct bio *bio) } bis->cur.ios[rwd]++; - u64_stats_update_end(&bis->sync); + u64_stats_update_end_irqrestore(&bis->sync, flags); if (cgroup_subsys_on_dfl(io_cgrp_subsys)) cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); put_cpu(); diff --git a/block/blk-core.c b/block/blk-core.c index 5454db2fa263..ac1de7d73a45 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -18,6 +18,7 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-pm.h> +#include <linux/blk-integrity.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> @@ -49,7 +50,7 @@ #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" -#include "blk-rq-qos.h" +#include "blk-throttle.h" struct dentry *blk_debugfs_root; @@ -215,8 +216,7 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); -static void print_req_error(struct request *req, blk_status_t status, - const char *caller) +void blk_print_req_error(struct request *req, blk_status_t status) { int idx = (__force int)status; @@ -224,9 +224,9 @@ static void print_req_error(struct request *req, blk_status_t status, return; printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", - caller, blk_errors[idx].name, + blk_errors[idx].name, req->rq_disk ? 
req->rq_disk->disk_name : "?", blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, @@ -234,33 +234,6 @@ static void print_req_error(struct request *req, blk_status_t status, IOPRIO_PRIO_CLASS(req->ioprio)); } -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (error) - bio->bi_status = error; - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - - bio_advance(bio, nbytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. - */ - if (bio->bi_iter.bi_size) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); -} - void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, @@ -337,23 +310,25 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_set_queue_dying(struct request_queue *q) +void blk_queue_start_drain(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ blk_freeze_queue_start(q); - if (queue_is_mq(q)) blk_mq_wake_waiters(q); - /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); } + +void blk_set_queue_dying(struct request_queue *q) +{ + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); +} EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** @@ -385,13 +360,8 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); - rq_qos_exit(q); - blk_queue_flag_set(QUEUE_FLAG_DEAD, q); - /* for synchronous bio-based driver finish in-flight integrity i/o */ - blk_flush_integrity(); - blk_sync_queue(q); if (queue_is_mq(q)) blk_mq_exit_queue(q); @@ -406,7 +376,7 @@ void blk_cleanup_queue(struct request_queue *q) */ mutex_lock(&q->sysfs_lock); if (q->elevator) - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock); percpu_ref_exit(&q->q_usage_counter); @@ -416,6 +386,30 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); +static bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) + goto fail; + + /* + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. + */ + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; + + rcu_read_unlock(); + return true; + +fail_put: + blk_queue_exit(q); +fail: + rcu_read_unlock(); + return false; +} + /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -425,40 +419,18 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; - while (true) { - bool success = false; - - rcu_read_lock(); - if (percpu_ref_tryget_live(&q->q_usage_counter)) { - /* - * The code that increments the pm_only counter is - * responsible for ensuring that that counter is - * globally visible before the queue is unfrozen. 
- */ - if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || - !blk_queue_pm_only(q)) { - success = true; - } else { - percpu_ref_put(&q->q_usage_counter); - } - } - rcu_read_unlock(); - - if (success) - return 0; - + while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* - * read pair of barrier in blk_freeze_queue_start(), - * we need to order reading __PERCPU_REF_DEAD flag of - * .q_usage_counter and reading .mq_freeze_depth or - * queue dying flag, otherwise the following wait may - * never return if the two reads are reordered. + * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. */ smp_rmb(); - wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || @@ -466,23 +438,44 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) if (blk_queue_dying(q)) return -ENODEV; } + + return 0; } static inline int bio_queue_enter(struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - bool nowait = bio->bi_opf & REQ_NOWAIT; - int ret; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + while (!blk_try_enter_queue(q, false)) { + struct gendisk *disk = bio->bi_bdev->bd_disk; - ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); - if (unlikely(ret)) { - if (nowait && !blk_queue_dying(q)) + if (bio->bi_opf & REQ_NOWAIT) { + if (test_bit(GD_DEAD, &disk->state)) + goto dead; bio_wouldblock_error(bio); - else - bio_io_error(bio); + return -EBUSY; + } + + /* + * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. + */ + smp_rmb(); + wait_event(q->mq_freeze_wq, + (!q->mq_freeze_depth && + blk_pm_resume_queue(false, q)) || + test_bit(GD_DEAD, &disk->state)); + if (test_bit(GD_DEAD, &disk->state)) + goto dead; } - return ret; + return 0; +dead: + bio_io_error(bio); + return -ENODEV; } void blk_queue_exit(struct request_queue *q) @@ -535,7 +528,7 @@ struct request_queue *blk_alloc_queue(int node_id) q->node = node_id; - atomic_set(&q->nr_active_requests_shared_sbitmap, 0); + atomic_set(&q->nr_active_requests_shared_tags, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); @@ -568,7 +561,7 @@ struct request_queue *blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); - q->nr_requests = BLKDEV_MAX_RQ; + q->nr_requests = BLKDEV_DEFAULT_RQ; return q; @@ -604,40 +597,13 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -/** - * blk_get_request - allocate a request - * @q: request queue to allocate a request for - * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. - * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. 
- */ -struct request *blk_get_request(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags) -{ - struct request *req; - - WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); - - req = blk_mq_alloc_request(q, op, flags); - if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) - q->mq_ops->initialize_rq_fn(req); - - return req; -} -EXPORT_SYMBOL(blk_get_request); - -void blk_put_request(struct request *req) -{ - blk_mq_free_request(req); -} -EXPORT_SYMBOL(blk_put_request); - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; - pr_info_ratelimited("attempt to access beyond end of device\n" + pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%s: rw=%d, want=%llu, limit=%llu\n", + current->comm, bio_devname(bio, b), bio->bi_opf, bio_end_sector(bio), maxsector); } @@ -779,7 +745,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; - struct request_queue *q = bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; @@ -821,7 +787,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) } if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio_clear_hipri(bio); + bio_clear_polled(bio); switch (bio_op(bio)) { case REQ_OP_DISCARD: @@ -894,18 +860,22 @@ end_io: return false; } -static blk_qc_t __submit_bio(struct bio *bio) +static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - blk_qc_t ret = BLK_QC_T_NONE; - if (blk_crypto_bio_prep(&bio)) { - if (!disk->fops->submit_bio) - return blk_mq_submit_bio(bio); - ret = disk->fops->submit_bio(bio); + if (unlikely(bio_queue_enter(bio) != 0)) + return; + + if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) + goto queue_exit; + if (!disk->fops->submit_bio) { + blk_mq_submit_bio(bio); + return; } + disk->fops->submit_bio(bio); +queue_exit: blk_queue_exit(disk->queue); - return ret; } /* @@ -927,10 +897,9 @@ static blk_qc_t __submit_bio(struct bio *bio) * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ -static blk_qc_t __submit_bio_noacct(struct bio *bio) +static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; - blk_qc_t ret = BLK_QC_T_NONE; BUG_ON(bio->bi_next); @@ -938,19 +907,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_list lower, same; - if (unlikely(bio_queue_enter(bio) != 0)) - continue; - /* * Create a fresh bio_list for all subordinate requests. 
*/ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = __submit_bio(bio); + __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the @@ -959,7 +925,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_bdev->bd_disk->queue) + if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -973,33 +939,19 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; - return ret; } -static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) +static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; - blk_qc_t ret = BLK_QC_T_NONE; current->bio_list = bio_list; do { - struct gendisk *disk = bio->bi_bdev->bd_disk; - - if (unlikely(bio_queue_enter(bio) != 0)) - continue; - - if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(disk->queue); - ret = BLK_QC_T_NONE; - continue; - } - - ret = blk_mq_submit_bio(bio); + __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; - return ret; } /** @@ -1011,25 +963,20 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) * systems and other upper level users of the block layer should use * submit_bio() instead. */ -blk_qc_t submit_bio_noacct(struct bio *bio) +void submit_bio_noacct(struct bio *bio) { - if (!submit_bio_checks(bio)) - return BLK_QC_T_NONE; - /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ - if (current->bio_list) { + if (current->bio_list) bio_list_add(¤t->bio_list[0], bio); - return BLK_QC_T_NONE; - } - - if (!bio->bi_bdev->bd_disk->fops->submit_bio) - return __submit_bio_noacct_mq(bio); - return __submit_bio_noacct(bio); + else if (!bio->bi_bdev->bd_disk->fops->submit_bio) + __submit_bio_noacct_mq(bio); + else + __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); @@ -1046,10 +993,10 @@ EXPORT_SYMBOL(submit_bio_noacct); * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has * been called. */ -blk_qc_t submit_bio(struct bio *bio) +void submit_bio(struct bio *bio) { if (blkcg_punt_bio_submit(bio)) - return BLK_QC_T_NONE; + return; /* * If it's a regular read/write or a barrier with data attached, @@ -1060,7 +1007,7 @@ blk_qc_t submit_bio(struct bio *bio) if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) count = queue_logical_block_size( - bio->bi_bdev->bd_disk->queue) >> 9; + bdev_get_queue(bio->bi_bdev)) >> 9; else count = bio_sectors(bio); @@ -1081,20 +1028,93 @@ blk_qc_t submit_bio(struct bio *bio) if (unlikely(bio_op(bio) == REQ_OP_READ && bio_flagged(bio, BIO_WORKINGSET))) { unsigned long pflags; - blk_qc_t ret; psi_memstall_enter(&pflags); - ret = submit_bio_noacct(bio); + submit_bio_noacct(bio); psi_memstall_leave(&pflags); - - return ret; + return; } - return submit_bio_noacct(bio); + submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** + * bio_poll - poll for BIO completions + * @bio: bio to poll for + * @flags: BLK_POLL_* flags that control the behavior + * + * Poll for completions on queue associated with the bio. Returns number of + * completed entries found. 
+ * + * Note: the caller must either be the context that submitted @bio, or + * be in a RCU critical section to prevent freeing of @bio. + */ +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + blk_qc_t cookie = READ_ONCE(bio->bi_cookie); + int ret; + + if (cookie == BLK_QC_T_NONE || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug(current->plug, false); + + if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) + return 0; + if (WARN_ON_ONCE(!queue_is_mq(q))) + ret = 0; /* not yet implemented, should not happen */ + else + ret = blk_mq_poll(q, cookie, iob, flags); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(bio_poll); + +/* + * Helper to implement file_operations.iopoll. Requires the bio to be stored + * in iocb->private, and cleared before freeing the bio. + */ +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags) +{ + struct bio *bio; + int ret = 0; + + /* + * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can + * point to a freshly allocated bio at this point. If that happens + * we have a few cases to consider: + * + * 1) the bio is beeing initialized and bi_bdev is NULL. We can just + * simply nothing in this case + * 2) the bio points to a not poll enabled device. bio_poll will catch + * this and return 0 + * 3) the bio points to a poll capable device, including but not + * limited to the one that the original bio pointed to. In this + * case we will call into the actual poll method and poll for I/O, + * even if we don't need to, but it won't cause harm either. + * + * For cases 2) and 3) above the RCU grace period ensures that bi_bdev + * is still allocated. Because partitions hold a reference to the whole + * device bdev and thus disk, the disk is also still valid. Grabbing + * a reference to the queue in bio_poll() ensures the hctxs and requests + * are still valid as well. + */ + rcu_read_lock(); + bio = READ_ONCE(kiocb->private); + if (bio && bio->bi_bdev) + ret = bio_poll(bio, iob, flags); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(iocb_bio_iopoll); + +/** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits * @q: the queue @@ -1169,8 +1189,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * if (blk_crypto_insert_cloned_request(rq)) return BLK_STS_IOERR; - if (blk_queue_io_stat(q)) - blk_account_io_start(rq); + blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, @@ -1238,41 +1257,19 @@ again: } } -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void __blk_account_io_done(struct request *req, u64 now) { - if (req->part && blk_do_io_stat(req)) { - const int sgrp = op_stat_group(req_op(req)); + const int sgrp = op_stat_group(req_op(req)); - part_stat_lock(); - part_stat_add(req->part, sectors[sgrp], bytes >> 9); - part_stat_unlock(); - } -} - -void blk_account_io_done(struct request *req, u64 now) -{ - /* - * Account IO completion. flush_rq isn't accounted as a - * normal IO on queueing nor completion. Accounting the - * containing request is enough. 
- */ - if (req->part && blk_do_io_stat(req) && - !(req->rq_flags & RQF_FLUSH_SEQ)) { - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); - } + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); } -void blk_account_io_start(struct request *rq) +void __blk_account_io_start(struct request *rq) { - if (!blk_do_io_stat(rq)) - return; - /* passthrough requests can hold bios that do not have ->bi_bdev set */ if (rq->bio && rq->bio->bi_bdev) rq->part = rq->bio->bi_bdev; @@ -1368,112 +1365,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) } EXPORT_SYMBOL_GPL(blk_steal_bios); -/** - * blk_update_request - Complete multiple bytes without completing the request - * @req: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete for @req - * - * Description: - * Ends I/O on a number of bytes attached to @req, but doesn't complete - * the request structure even if @req doesn't have leftover. - * If @req has leftover, sets it up for the next range of segments. - * - * Passing the result of blk_rq_bytes() as @nr_bytes guarantees - * %false return from this function. - * - * Note: - * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function - * except in the consistency check at the end of this function. - * - * Return: - * %false - this request doesn't have any more data - * %true - this request has more data - **/ -bool blk_update_request(struct request *req, blk_status_t error, - unsigned int nr_bytes) -{ - int total_bytes; - - trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); - - if (!req->bio) - return false; - -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif - - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) - print_req_error(req, error, __func__); - - blk_account_io_completion(req, nr_bytes); - - total_bytes = 0; - while (req->bio) { - struct bio *bio = req->bio; - unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - - if (bio_bytes == bio->bi_iter.bi_size) - req->bio = bio->bi_next; - - /* Completion has already been traced */ - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); - - total_bytes += bio_bytes; - nr_bytes -= bio_bytes; - - if (!nr_bytes) - break; - } - - /* - * completely done - */ - if (!req->bio) { - /* - * Reset counters so that the request stacking driver - * can find how many bytes remain in the request - * later. - */ - req->__data_len = 0; - return false; - } - - req->__data_len -= total_bytes; - - /* update sector only for requests with clear definition of sector */ - if (!blk_rq_is_passthrough(req)) - req->__sector += total_bytes >> 9; - - /* mixed attributes always follow the first bio */ - if (req->rq_flags & RQF_MIXED_MERGE) { - req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; - } - - if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { - /* - * If total number of sectors is less than the first segment - * size, something has gone terribly wrong. 
- */ - if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - blk_dump_rq_flags(req, "request botched"); - req->__data_len = blk_rq_cur_bytes(req); - } - - /* recalculate the number of segments */ - req->nr_phys_segments = blk_recalc_rq_segments(req); - } - - return true; -} -EXPORT_SYMBOL_GPL(blk_update_request); - #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE /** * rq_flush_dcache_pages - Helper function to flush all pages in a request @@ -1621,6 +1512,32 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); +void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) +{ + struct task_struct *tsk = current; + + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + + plug->mq_list = NULL; + plug->cached_rq = NULL; + plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); + plug->rq_count = 0; + plug->multiple_queues = false; + plug->has_elevator = false; + plug->nowait = false; + INIT_LIST_HEAD(&plug->cb_list); + + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; +} + /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized @@ -1646,25 +1563,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on); */ void blk_start_plug(struct blk_plug *plug) { - struct task_struct *tsk = current; - - /* - * If this is a nested plug, don't actually assign it. - */ - if (tsk->plug) - return; - - INIT_LIST_HEAD(&plug->mq_list); - INIT_LIST_HEAD(&plug->cb_list); - plug->rq_count = 0; - plug->multiple_queues = false; - plug->nowait = false; - - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; + blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); @@ -1710,12 +1609,14 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) +void blk_flush_plug(struct blk_plug *plug, bool from_schedule) { - flush_plug_callbacks(plug, from_schedule); - - if (!list_empty(&plug->mq_list)) + if (!list_empty(&plug->cb_list)) + flush_plug_callbacks(plug, from_schedule); + if (!rq_list_empty(plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); + if (unlikely(!from_schedule && plug->cached_rq)) + blk_mq_free_plug_rqs(plug); } /** @@ -1730,11 +1631,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ void blk_finish_plug(struct blk_plug *plug) { - if (plug != current->plug) - return; - blk_flush_plug_list(plug, false); - - current->plug = NULL; + if (plug == current->plug) { + blk_flush_plug(plug, false); + current->plug = NULL; + } } EXPORT_SYMBOL(blk_finish_plug); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c322176a1e09..c87aba8584c6 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -12,12 +12,13 @@ #include <crypto/skcipher.h> #include <linux/blk-cgroup.h> #include <linux/blk-crypto.h> +#include <linux/blk-crypto-profile.h> #include <linux/blkdev.h> #include <linux/crypto.h> -#include <linux/keyslot-manager.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/random.h> +#include <linux/scatterlist.h> #include "blk-crypto-internal.h" @@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool; static 
DEFINE_MUTEX(tfms_init_lock); static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; -static struct blk_crypto_keyslot { +static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num crypto_mode; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; -static struct blk_keyslot_manager blk_crypto_ksm; +static struct blk_crypto_profile blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; static struct bio_set crypto_bio_split; @@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split; */ static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; -static void blk_crypto_evict_keyslot(unsigned int slot) +static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; int err; @@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot) slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; } -static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int +blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; const enum blk_crypto_mode_num crypto_mode = key->crypto_cfg.crypto_mode; int err; if (crypto_mode != slotp->crypto_mode && slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, key->size); if (err) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return err; } return 0; } -static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return 0; } -/* - * The crypto API fallback KSM ops - only used for a bio when it specifies a - * blk_crypto_key that was not supported by the device's inline encryption - * hardware. 
- */ -static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { - .keyslot_program = blk_crypto_keyslot_program, - .keyslot_evict = blk_crypto_keyslot_evict, +static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { + .keyslot_program = blk_crypto_fallback_keyslot_program, + .keyslot_evict = blk_crypto_fallback_keyslot_evict, }; static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) @@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) bio_endio(src_bio); } -static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) return bio; } -static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static bool +blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) { struct skcipher_request *ciph_req; - const struct blk_crypto_keyslot *slotp; - int keyslot_idx = blk_ksm_get_slot_idx(slot); + const struct blk_crypto_fallback_keyslot *slotp; + int keyslot_idx = blk_crypto_keyslot_index(slot); slotp = &blk_crypto_keyslots[keyslot_idx]; ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], @@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, return true; } -static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; unsigned int i = 0; @@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) { struct bio *src_bio, *enc_bio; struct bio_crypt_ctx *bc; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; int data_unit_size; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) blk_status_t blk_st; /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_split_bio_if_needed(bio_ptr)) + if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) return false; src_bio = *bio_ptr; @@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_clone_bio(src_bio); + enc_bio = blk_crypto_fallback_clone_bio(src_bio); if (!enc_bio) { src_bio->bi_status = BLK_STS_RESOURCE; return false; } /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. 
*/ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { src_bio->bi_status = blk_st; goto out_put_enc_bio; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { src_bio->bi_status = BLK_STS_RESOURCE; goto out_release_keyslot; } @@ -362,7 +361,7 @@ out_free_bounce_pages: out_free_ciph_req: skcipher_request_free(ciph_req); out_release_keyslot: - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) bio_put(enc_bio); @@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) container_of(work, struct bio_fallback_crypt_ctx, work); struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) blk_status_t blk_st; /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { bio->bi_status = blk_st; goto out_no_keyslot; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { bio->bi_status = BLK_STS_RESOURCE; goto out; } @@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) out: skcipher_request_free(ciph_req); - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); bio_endio(bio); @@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) * @bio_ptr: pointer to the bio to prepare * * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio - * for the first part, encrypts it, and update bio_ptr to point to the bounce - * bio. + * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a + * bounce bio for the first part, encrypts it, and updates bio_ptr to point to + * the bounce bio. * * For a READ operation, we mark the bio for decryption by using bi_private and * bi_end_io. 
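
For reference, a minimal sketch (not taken from this patch) of how a storage driver might wire up the renamed blk-crypto profile API that blk-crypto-fallback is converted to above: blk_crypto_profile_init(), the blk_crypto_ll_ops keyslot hooks, and blk_crypto_register(). The my_* names and MY_NUM_KEYSLOTS are invented for illustration, the keyslot hooks are stubs, and the AES-256-XTS mode index plus the data-unit-size bitmask convention for modes_supported[] are assumptions based on <linux/blk-crypto.h>, not something shown in this hunk.

#include <linux/blkdev.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>

#define MY_NUM_KEYSLOTS	32	/* invented for the sketch */

static struct blk_crypto_profile my_profile;

/* Stub: would program @key into hardware keyslot @slot. */
static int my_keyslot_program(struct blk_crypto_profile *profile,
			      const struct blk_crypto_key *key,
			      unsigned int slot)
{
	return 0;
}

/* Stub: would remove whatever key is in hardware keyslot @slot. */
static int my_keyslot_evict(struct blk_crypto_profile *profile,
			    const struct blk_crypto_key *key,
			    unsigned int slot)
{
	return 0;
}

static const struct blk_crypto_ll_ops my_ll_ops = {
	.keyslot_program	= my_keyslot_program,
	.keyslot_evict		= my_keyslot_evict,
};

static int my_driver_setup_crypto(struct request_queue *q)
{
	int err;

	err = blk_crypto_profile_init(&my_profile, MY_NUM_KEYSLOTS);
	if (err)
		return err;

	my_profile.ll_ops = my_ll_ops;
	my_profile.max_dun_bytes_supported = 8;
	/*
	 * Assumed convention: each modes_supported[] entry is a bitmask of
	 * supported data unit sizes; 4096 here would mean 4 KiB units only.
	 */
	my_profile.modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 4096;

	if (!blk_crypto_register(&my_profile, q)) {
		blk_crypto_profile_destroy(&my_profile);
		return -EINVAL;
	}
	return 0;
}

This mirrors what blk_crypto_fallback_init() does in the hunks above (init the profile, fill in ll_ops, max_dun_bytes_supported and modes_supported, then expose it), with blk_crypto_register() attaching the profile to the request_queue in place of the old keyslot-manager registration.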
@@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) return false; } - if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, - &bc->bc_key->crypto_cfg)) { + if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile, + &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; return false; } @@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { - return blk_ksm_evict_key(&blk_crypto_ksm, key); + return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key); } static bool blk_crypto_fallback_inited; @@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void) { int i; int err; + struct blk_crypto_profile *profile = &blk_crypto_fallback_profile; if (blk_crypto_fallback_inited) return 0; @@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void) if (err) goto out; - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots); if (err) goto fail_free_bioset; err = -ENOMEM; - blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; - blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + profile->ll_ops = blk_crypto_fallback_ll_ops; + profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; /* All blk-crypto modes have a crypto API fallback. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) - blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; - blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + profile->modes_supported[i] = 0xFFFFFFFF; + profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; blk_crypto_wq = alloc_workqueue("blk_crypto_wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, num_online_cpus()); if (!blk_crypto_wq) - goto fail_free_ksm; + goto fail_destroy_profile; blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, sizeof(blk_crypto_keyslots[0]), @@ -595,8 +596,8 @@ fail_free_keyslots: kfree(blk_crypto_keyslots); fail_free_wq: destroy_workqueue(blk_crypto_wq); -fail_free_ksm: - blk_ksm_destroy(&blk_crypto_ksm); +fail_destroy_profile: + blk_crypto_profile_destroy(profile); fail_free_bioset: bioset_exit(&crypto_bio_split); out: @@ -610,7 +611,7 @@ out: int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; - struct blk_crypto_keyslot *slotp; + struct blk_crypto_fallback_keyslot *slotp; unsigned int i; int err = 0; diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 0d36aae538d7..2fb0d65a464c 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -7,7 +7,7 @@ #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c new file mode 100644 index 000000000000..605ba0626a5c --- /dev/null +++ b/block/blk-crypto-profile.c @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/** + * DOC: blk-crypto profiles + * + * 'struct blk_crypto_profile' contains all generic inline encryption-related + * state for a particular inline encryption device. blk_crypto_profile serves + * as the way that drivers for inline encryption hardware expose their crypto + * capabilities and certain functions (e.g., functions to program and evict + * keys) to upper layers. 
Device drivers that want to support inline encryption + * construct a crypto profile, then associate it with the disk's request_queue. + * + * If the device has keyslots, then its blk_crypto_profile also handles managing + * these keyslots in a device-independent way, using the driver-provided + * functions to program and evict keys as needed. This includes keeping track + * of which key and how many I/O requests are using each keyslot, getting + * keyslots for I/O requests, and handling key eviction requests. + * + * For more information, see Documentation/block/inline-encryption.rst. + */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include <linux/blk-crypto-profile.h> +#include <linux/device.h> +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/pm_runtime.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/blk-integrity.h> + +struct blk_crypto_keyslot { + atomic_t slot_refs; + struct list_head idle_slot_node; + struct hlist_node hash_node; + const struct blk_crypto_key *key; + struct blk_crypto_profile *profile; +}; + +static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile) +{ + /* + * Calling into the driver requires profile->lock held and the device + * resumed. But we must resume the device first, since that can acquire + * and release profile->lock via blk_crypto_reprogram_all_keys(). + */ + if (profile->dev) + pm_runtime_get_sync(profile->dev); + down_write(&profile->lock); +} + +static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile) +{ + up_write(&profile->lock); + if (profile->dev) + pm_runtime_put_sync(profile->dev); +} + +/** + * blk_crypto_profile_init() - Initialize a blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Storage drivers must call this when starting to set up a blk_crypto_profile, + * before filling in additional fields. + * + * Return: 0 on success, or else a negative error code. + */ +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(profile, 0, sizeof(*profile)); + init_rwsem(&profile->lock); + + if (num_slots == 0) + return 0; + + /* Initialize keyslot management data. */ + + profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), + GFP_KERNEL); + if (!profile->slots) + return -ENOMEM; + + profile->num_slots = num_slots; + + init_waitqueue_head(&profile->idle_slots_wait_queue); + INIT_LIST_HEAD(&profile->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + profile->slots[slot].profile = profile; + list_add_tail(&profile->slots[slot].idle_slot_node, + &profile->idle_slots); + } + + spin_lock_init(&profile->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + /* + * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 + * buckets. This only makes a difference when there is only 1 keyslot. 
+ */ + if (slot_hashtable_size < 2) + slot_hashtable_size = 2; + + profile->log_slot_ht_size = ilog2(slot_hashtable_size); + profile->slot_hashtable = + kvmalloc_array(slot_hashtable_size, + sizeof(profile->slot_hashtable[0]), GFP_KERNEL); + if (!profile->slot_hashtable) + goto err_destroy; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&profile->slot_hashtable[i]); + + return 0; + +err_destroy: + blk_crypto_profile_destroy(profile); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_init); + +static void blk_crypto_profile_destroy_callback(void *profile) +{ + blk_crypto_profile_destroy(profile); +} + +/** + * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init() + * @dev: the device which owns the blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be + * called automatically on driver detach. + * + * Return: 0 on success, or else a negative error code. + */ +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + int err = blk_crypto_profile_init(profile, num_slots); + + if (err) + return err; + + return devm_add_action_or_reset(dev, + blk_crypto_profile_destroy_callback, + profile); +} +EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init); + +static inline struct hlist_head * +blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + return &profile->slot_hashtable[ + hash_ptr(key, profile->log_slot_ht_size)]; +} + +static void +blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile = slot->profile; + unsigned long flags; + + spin_lock_irqsave(&profile->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); +} + +static struct blk_crypto_keyslot * +blk_crypto_find_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = + blk_crypto_hash_bucket_for_key(profile, key); + struct blk_crypto_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_crypto_keyslot * +blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_crypto_remove_slot_from_lru_list(slot); + } + return slot; +} + +/** + * blk_crypto_keyslot_index() - Get the index of a keyslot + * @slot: a keyslot that blk_crypto_get_keyslot() returned + * + * Return: the 0-based index of the keyslot within the device's keyslots. + */ +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot) +{ + return slot - slot->profile->slots; +} +EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index); + +/** + * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed. + * @profile: the crypto profile of the device the key will be used on + * @key: the key that will be used + * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct + * will be stored here; otherwise NULL will be stored here. 
+ * + * If the device has keyslots, this gets a keyslot that's been programmed with + * the specified key. If the key is already in a slot, this reuses it; + * otherwise this waits for a slot to become idle and programs the key into it. + * + * This must be paired with a call to blk_crypto_put_keyslot(). + * + * Context: Process context. Takes and releases profile->lock. + * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or + * one wasn't needed; or a blk_status_t error on failure. + */ +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr) +{ + struct blk_crypto_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + + /* + * If the device has no concept of "keyslots", then there is no need to + * get one. + */ + if (profile->num_slots == 0) + return BLK_STS_OK; + + down_read(&profile->lock); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + up_read(&profile->lock); + if (slot) + goto success; + + for (;;) { + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + if (slot) { + blk_crypto_hw_exit(profile); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. + */ + if (!list_empty(&profile->idle_slots)) + break; + + blk_crypto_hw_exit(profile); + wait_event(profile->idle_slots_wait_queue, + !list_empty(&profile->idle_slots)); + } + + slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot, + idle_slot_node); + slot_idx = blk_crypto_keyslot_index(slot); + + err = profile->ll_ops.keyslot_program(profile, key, slot_idx); + if (err) { + wake_up(&profile->idle_slots_wait_queue); + blk_crypto_hw_exit(profile); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, + blk_crypto_hash_bucket_for_key(profile, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_crypto_remove_slot_from_lru_list(slot); + + blk_crypto_hw_exit(profile); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_crypto_put_keyslot() - Release a reference to a keyslot + * @slot: The keyslot to release the reference of (may be NULL). + * + * Context: Any context. + */ +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile; + unsigned long flags; + + if (!slot) + return; + + profile = slot->profile; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &profile->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &profile->idle_slots); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); + wake_up(&profile->idle_slots_wait_queue); + } +} + +/** + * __blk_crypto_cfg_supported() - Check whether the given crypto profile + * supports the given crypto configuration. + * @profile: the crypto profile to check + * @cfg: the crypto configuration to check for + * + * Return: %true if @profile supports the given @cfg. + */ +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg) +{ + if (!profile) + return false; + if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size)) + return false; + if (profile->max_dun_bytes_supported < cfg->dun_bytes) + return false; + return true; +} + +/** + * __blk_crypto_evict_key() - Evict a key from a device. 
+ * @profile: the crypto profile of the device + * @key: the key to evict. It must not still be used in any I/O. + * + * If the device has keyslots, this finds the keyslot (if any) that contains the + * specified key and calls the driver's keyslot_evict function to evict it. + * + * Otherwise, this just calls the driver's keyslot_evict function if it is + * implemented, passing just the key (without any particular keyslot). This + * allows layered devices to evict the key from their underlying devices. + * + * Context: Process context. Takes and releases profile->lock. + * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY + * if the keyslot is still in use, or another -errno value on other + * error. + */ +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + int err = 0; + + if (profile->num_slots == 0) { + if (profile->ll_ops.keyslot_evict) { + blk_crypto_hw_enter(profile); + err = profile->ll_ops.keyslot_evict(profile, key, -1); + blk_crypto_hw_exit(profile); + return err; + } + return 0; + } + + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + goto out_unlock; + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + err = -EBUSY; + goto out_unlock; + } + err = profile->ll_ops.keyslot_evict(profile, key, + blk_crypto_keyslot_index(slot)); + if (err) + goto out_unlock; + + hlist_del(&slot->hash_node); + slot->key = NULL; + err = 0; +out_unlock: + blk_crypto_hw_exit(profile); + return err; +} + +/** + * blk_crypto_reprogram_all_keys() - Re-program all keyslots. + * @profile: The crypto profile + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. Takes and releases profile->lock. + */ +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile) +{ + unsigned int slot; + + if (profile->num_slots == 0) + return; + + /* This is for device initialization, so don't resume the device */ + down_write(&profile->lock); + for (slot = 0; slot < profile->num_slots; slot++) { + const struct blk_crypto_key *key = profile->slots[slot].key; + int err; + + if (!key) + continue; + + err = profile->ll_ops.keyslot_program(profile, key, slot); + WARN_ON(err); + } + up_write(&profile->lock); +} +EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys); + +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile) +{ + if (!profile) + return; + kvfree(profile->slot_hashtable); + kvfree_sensitive(profile->slots, + sizeof(profile->slots[0]) * profile->num_slots); + memzero_explicit(profile, sizeof(*profile)); +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy); + +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. 
Disabling hardware inline encryption.\n"); + return false; + } + q->crypto_profile = profile; + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_register); + +void blk_crypto_unregister(struct request_queue *q) +{ + q->crypto_profile = NULL; +} + +/** + * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities + * by child device + * @parent: the crypto profile for the parent device + * @child: the crypto profile for the child device, or NULL + * + * This clears all crypto capabilities in @parent that aren't set in @child. If + * @child is NULL, then this clears all parent capabilities. + * + * Only use this when setting up the crypto profile for a layered device, before + * it's been exposed yet. + */ +void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, + const struct blk_crypto_profile *child) +{ + if (child) { + unsigned int i; + + parent->max_dun_bytes_supported = + min(parent->max_dun_bytes_supported, + child->max_dun_bytes_supported); + for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) + parent->modes_supported[i] &= child->modes_supported[i]; + } else { + parent->max_dun_bytes_supported = 0; + memset(parent->modes_supported, 0, + sizeof(parent->modes_supported)); + } +} +EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); + +/** + * blk_crypto_has_capabilities() - Check whether @target supports at least all + * the crypto capabilities that @reference does. + * @target: the target profile + * @reference: the reference profile + * + * Return: %true if @target supports all the crypto capabilities of @reference. + */ +bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, + const struct blk_crypto_profile *reference) +{ + int i; + + if (!reference) + return true; + + if (!target) + return false; + + for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) { + if (reference->modes_supported[i] & ~target->modes_supported[i]) + return false; + } + + if (reference->max_dun_bytes_supported > + target->max_dun_bytes_supported) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); + +/** + * blk_crypto_update_capabilities() - Update the capabilities of a crypto + * profile to match those of another crypto + * profile. + * @dst: The crypto profile whose capabilities to update. + * @src: The crypto profile whose capabilities this function will update @dst's + * capabilities to. + * + * Blk-crypto requires that crypto capabilities that were + * advertised when a bio was created continue to be supported by the + * device until that bio is ended. This is turn means that a device cannot + * shrink its advertised crypto capabilities without any explicit + * synchronization with upper layers. So if there's no such explicit + * synchronization, @src must support all the crypto capabilities that + * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)). + * + * Note also that as long as the crypto capabilities are being expanded, the + * order of updates becoming visible is not important because it's alright + * for blk-crypto to see stale values - they only cause blk-crypto to + * believe that a crypto capability isn't supported when it actually is (which + * might result in blk-crypto-fallback being used if available, or the bio being + * failed). 
+ */ +void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, + const struct blk_crypto_profile *src) +{ + memcpy(dst->modes_supported, src->modes_supported, + sizeof(dst->modes_supported)); + + dst->max_dun_bytes_supported = src->max_dun_bytes_supported; +} +EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 103c2e2d50d6..ec9efeeeca91 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -11,7 +11,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #include <linux/module.h> #include <linux/slab.h> @@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio) blk_status_t __blk_crypto_init_request(struct request *rq) { - return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, - &rq->crypt_keyslot); + return blk_crypto_get_keyslot(rq->q->crypto_profile, + rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); } /** @@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq) */ void __blk_crypto_free_request(struct request *rq) { - blk_ksm_put_slot(rq->crypt_keyslot); + blk_crypto_put_keyslot(rq->crypt_keyslot); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); blk_crypto_rq_set_defaults(rq); } @@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + struct blk_crypto_profile *profile; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, - &bc_key->crypto_cfg)) + profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; + if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) return true; if (blk_crypto_fallback_bio_prep(bio_ptr)) @@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_ksm_crypto_cfg_supported(q->ksm, cfg); + __blk_crypto_cfg_supported(q->crypto_profile, cfg); } /** @@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q, int blk_crypto_start_using_key(const struct blk_crypto_key *key, struct request_queue *q) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, * evicted from any hardware that it might have been programmed into. The key * must not be in use by any in-flight IO when this function is called. * - * Return: 0 on success or if key is not present in the q's ksm, -err on error. + * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. 
*/ int blk_crypto_evict_key(struct request_queue *q, const struct blk_crypto_key *key) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) - return blk_ksm_evict_key(q->ksm, key); + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request queue's associated inline encryption hardware didn't - * have support for the key, then the key might have been programmed - * into the fallback keyslot manager, so try to evict from there. + * If the request_queue didn't support the key, then blk-crypto-fallback + * may have been used, so try to evict the key from blk-crypto-fallback. */ return blk_crypto_fallback_evict_key(key); } diff --git a/block/blk-exec.c b/block/blk-exec.c index d6cd501c0d34..1b8b47f6e79b 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,13 +65,19 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); static bool blk_rq_is_poll(struct request *rq) { - return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL; + if (!rq->mq_hctx) + return false; + if (rq->mq_hctx->type != HCTX_TYPE_POLL) + return false; + if (WARN_ON_ONCE(!rq->bio)) + return false; + return true; } static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true); + bio_poll(rq->bio, NULL, 0); cond_resched(); } while (!completion_done(wait)); } diff --git a/block/blk-flush.c b/block/blk-flush.c index 4201728bf3a5..8e364bda5166 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. */ -void blk_insert_flush(struct request *rq) +bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ @@ -409,7 +409,7 @@ void blk_insert_flush(struct request *rq) */ if (!policy) { blk_mq_end_request(rq, 0); - return; + return true; } BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ @@ -420,10 +420,8 @@ void blk_insert_flush(struct request *rq) * for normal execution. */ if ((policy & REQ_FSEQ_DATA) && - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, false, false); - return; - } + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) + return false; /* * @rq should go through flush machinery. Mark it part of flush @@ -439,6 +437,8 @@ void blk_insert_flush(struct request *rq) spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); + + return true; } /** diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c new file mode 100644 index 000000000000..c246c425d0d7 --- /dev/null +++ b/block/blk-ia-ranges.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block device concurrent positioning ranges. + * + * Copyright (C) 2021 Western Digital Corporation or its Affiliates. 
+ */ +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/init.h> + +#include "blk.h" + +static ssize_t +blk_ia_range_sector_show(struct blk_independent_access_range *iar, + char *buf) +{ + return sprintf(buf, "%llu\n", iar->sector); +} + +static ssize_t +blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar, + char *buf) +{ + return sprintf(buf, "%llu\n", iar->nr_sectors); +} + +struct blk_ia_range_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_independent_access_range *iar, char *buf); +}; + +static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { + .attr = { .name = "sector", .mode = 0444 }, + .show = blk_ia_range_sector_show, +}; + +static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { + .attr = { .name = "nr_sectors", .mode = 0444 }, + .show = blk_ia_range_nr_sectors_show, +}; + +static struct attribute *blk_ia_range_attrs[] = { + &blk_ia_range_sector_entry.attr, + &blk_ia_range_nr_sectors_entry.attr, + NULL, +}; +ATTRIBUTE_GROUPS(blk_ia_range); + +static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct blk_ia_range_sysfs_entry *entry = + container_of(attr, struct blk_ia_range_sysfs_entry, attr); + struct blk_independent_access_range *iar = + container_of(kobj, struct blk_independent_access_range, kobj); + ssize_t ret; + + mutex_lock(&iar->queue->sysfs_lock); + ret = entry->show(iar, buf); + mutex_unlock(&iar->queue->sysfs_lock); + + return ret; +} + +static const struct sysfs_ops blk_ia_range_sysfs_ops = { + .show = blk_ia_range_sysfs_show, +}; + +/* + * Independent access range entries are not freed individually, but alltogether + * with struct blk_independent_access_ranges and its array of ranges. Since + * kobject_add() takes a reference on the parent kobject contained in + * struct blk_independent_access_ranges, the array of independent access range + * entries cannot be freed until kobject_del() is called for all entries. + * So we do not need to do anything here, but still need this no-op release + * operation to avoid complaints from the kobject code. + */ +static void blk_ia_range_sysfs_nop_release(struct kobject *kobj) +{ +} + +static struct kobj_type blk_ia_range_ktype = { + .sysfs_ops = &blk_ia_range_sysfs_ops, + .default_groups = blk_ia_range_groups, + .release = blk_ia_range_sysfs_nop_release, +}; + +/* + * This will be executed only after all independent access range entries are + * removed with kobject_del(), at which point, it is safe to free everything, + * including the array of ranges. + */ +static void blk_ia_ranges_sysfs_release(struct kobject *kobj) +{ + struct blk_independent_access_ranges *iars = + container_of(kobj, struct blk_independent_access_ranges, kobj); + + kfree(iars); +} + +static struct kobj_type blk_ia_ranges_ktype = { + .release = blk_ia_ranges_sysfs_release, +}; + +/** + * disk_register_ia_ranges - register with sysfs a set of independent + * access ranges + * @disk: Target disk + * @new_iars: New set of independent access ranges + * + * Register with sysfs a set of independent access ranges for @disk. + * If @new_iars is not NULL, this set of ranges is registered and the old set + * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is + * registered if it is not already. 
+ */ +int disk_register_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *new_iars) +{ + struct request_queue *q = disk->queue; + struct blk_independent_access_ranges *iars; + int i, ret; + + lockdep_assert_held(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_lock); + + /* If a new range set is specified, unregister the old one */ + if (new_iars) { + if (q->ia_ranges) + disk_unregister_independent_access_ranges(disk); + q->ia_ranges = new_iars; + } + + iars = q->ia_ranges; + if (!iars) + return 0; + + /* + * At this point, iars is the new set of sector access ranges that needs + * to be registered with sysfs. + */ + WARN_ON(iars->sysfs_registered); + ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, + &q->kobj, "%s", "independent_access_ranges"); + if (ret) { + q->ia_ranges = NULL; + kfree(iars); + return ret; + } + + for (i = 0; i < iars->nr_ia_ranges; i++) { + iars->ia_range[i].queue = q; + ret = kobject_init_and_add(&iars->ia_range[i].kobj, + &blk_ia_range_ktype, &iars->kobj, + "%d", i); + if (ret) { + while (--i >= 0) + kobject_del(&iars->ia_range[i].kobj); + kobject_del(&iars->kobj); + kobject_put(&iars->kobj); + return ret; + } + } + + iars->sysfs_registered = true; + + return 0; +} + +void disk_unregister_independent_access_ranges(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct blk_independent_access_ranges *iars = q->ia_ranges; + int i; + + lockdep_assert_held(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_lock); + + if (!iars) + return; + + if (iars->sysfs_registered) { + for (i = 0; i < iars->nr_ia_ranges; i++) + kobject_del(&iars->ia_range[i].kobj); + kobject_del(&iars->kobj); + kobject_put(&iars->kobj); + } else { + kfree(iars); + } + + q->ia_ranges = NULL; +} + +static struct blk_independent_access_range * +disk_find_ia_range(struct blk_independent_access_ranges *iars, + sector_t sector) +{ + struct blk_independent_access_range *iar; + int i; + + for (i = 0; i < iars->nr_ia_ranges; i++) { + iar = &iars->ia_range[i]; + if (sector >= iar->sector && + sector < iar->sector + iar->nr_sectors) + return iar; + } + + return NULL; +} + +static bool disk_check_ia_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars) +{ + struct blk_independent_access_range *iar, *tmp; + sector_t capacity = get_capacity(disk); + sector_t sector = 0; + int i; + + /* + * While sorting the ranges in increasing LBA order, check that the + * ranges do not overlap, that there are no sector holes and that all + * sectors belong to one range. 
+ */ + for (i = 0; i < iars->nr_ia_ranges; i++) { + tmp = disk_find_ia_range(iars, sector); + if (!tmp || tmp->sector != sector) { + pr_warn("Invalid non-contiguous independent access ranges\n"); + return false; + } + + iar = &iars->ia_range[i]; + if (tmp != iar) { + swap(iar->sector, tmp->sector); + swap(iar->nr_sectors, tmp->nr_sectors); + } + + sector += iar->nr_sectors; + } + + if (sector != capacity) { + pr_warn("Independent access ranges do not match disk capacity\n"); + return false; + } + + return true; +} + +static bool disk_ia_ranges_changed(struct gendisk *disk, + struct blk_independent_access_ranges *new) +{ + struct blk_independent_access_ranges *old = disk->queue->ia_ranges; + int i; + + if (!old) + return true; + + if (old->nr_ia_ranges != new->nr_ia_ranges) + return true; + + for (i = 0; i < old->nr_ia_ranges; i++) { + if (new->ia_range[i].sector != old->ia_range[i].sector || + new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors) + return true; + } + + return false; +} + +/** + * disk_alloc_independent_access_ranges - Allocate an independent access ranges + * data structure + * @disk: target disk + * @nr_ia_ranges: Number of independent access ranges + * + * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges + * access range descriptors. + */ +struct blk_independent_access_ranges * +disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges) +{ + struct blk_independent_access_ranges *iars; + + iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges), + GFP_KERNEL, disk->queue->node); + if (iars) + iars->nr_ia_ranges = nr_ia_ranges; + return iars; +} +EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges); + +/** + * disk_set_independent_access_ranges - Set a disk independent access ranges + * @disk: target disk + * @iars: independent access ranges structure + * + * Set the independent access ranges information of the request queue + * of @disk to @iars. If @iars is NULL and the independent access ranges + * structure already set is cleared. If there are no differences between + * @iars and the independent access ranges structure already set, @iars + * is freed. + */ +void disk_set_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars) +{ + struct request_queue *q = disk->queue; + + if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) { + kfree(iars); + iars = NULL; + } + + mutex_lock(&q->sysfs_dir_lock); + mutex_lock(&q->sysfs_lock); + + if (iars) { + if (!disk_check_ia_ranges(disk, iars)) { + kfree(iars); + iars = NULL; + goto reg; + } + + if (!disk_ia_ranges_changed(disk, iars)) { + kfree(iars); + goto unlock; + } + } + + /* + * This may be called for a registered queue. E.g. during a device + * revalidation. If that is the case, we need to unregister the old + * set of independent access ranges and register the new set. If the + * queue is not registered, registration of the device request queue + * will register the independent access ranges, so only swap in the + * new set and free the old one. + */ +reg: + if (blk_queue_registered(q)) { + disk_register_independent_access_ranges(disk, iars); + } else { + swap(q->ia_ranges, iars); + kfree(iars); + } + +unlock: + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); +} +EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 16d5d5338392..d670d54e5f7a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -6,7 +6,7 @@ * Written by: Martin K. 
Petersen <martin.petersen@oracle.com> */ -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> @@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->ksm) { + if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - blk_ksm_unregister(disk->queue); + blk_crypto_unregister(disk->queue); } #endif } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index b3880e4ba22a..a5b37cc65b17 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3165,12 +3165,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return PTR_ERR(bdev); - ioc = q_to_ioc(bdev->bd_disk->queue); + ioc = q_to_ioc(bdev_get_queue(bdev)); if (!ioc) { - ret = blk_iocost_init(bdev->bd_disk->queue); + ret = blk_iocost_init(bdev_get_queue(bdev)); if (ret) goto err; - ioc = q_to_ioc(bdev->bd_disk->queue); + ioc = q_to_ioc(bdev_get_queue(bdev)); } spin_lock_irq(&ioc->lock); @@ -3332,12 +3332,12 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return PTR_ERR(bdev); - ioc = q_to_ioc(bdev->bd_disk->queue); + ioc = q_to_ioc(bdev_get_queue(bdev)); if (!ioc) { - ret = blk_iocost_init(bdev->bd_disk->queue); + ret = blk_iocost_init(bdev_get_queue(bdev)); if (ret) goto err; - ioc = q_to_ioc(bdev->bd_disk->queue); + ioc = q_to_ioc(bdev_get_queue(bdev)); } spin_lock_irq(&ioc->lock); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c0545f9da549..6593c7123b97 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -74,6 +74,7 @@ #include <linux/sched/signal.h> #include <trace/events/block.h> #include <linux/blk-mq.h> +#include <linux/blk-cgroup.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk.h" diff --git a/block/blk-merge.c b/block/blk-merge.c index 7a5c81c02c80..df69f4bb7717 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -6,12 +6,45 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/scatterlist.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" +#include "blk-throttle.h" + +static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) +{ + *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); +} + +static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) +{ + struct bvec_iter iter = bio->bi_iter; + int idx; + + bio_get_first_bvec(bio, bv); + if (bv->bv_len == bio->bi_iter.bi_size) + return; /* this bio only has a single bvec */ + + bio_advance_iter(bio, &iter, iter.bi_size); + + if (!iter.bi_bvec_done) + idx = iter.bi_idx - 1; + else /* in the middle of bvec */ + idx = iter.bi_idx; + + *bv = bio->bi_io_vec[idx]; + + /* + * iter.bi_bvec_done records actual length of the last bvec + * if this bio ends in the middle of one io vector + */ + if (iter.bi_bvec_done) + bv->bv_len = iter.bi_bvec_done; +} static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) @@ -285,13 +318,13 @@ split: * iopoll in direct IO routine. Given performance gain of iopoll for * big IO can be trival, disable iopoll when split needed. 
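bio_get_last_bvec() above recovers the final vector of a bio by advancing a throwaway copy of the iterator to the end of the I/O: if bi_bvec_done lands on zero the iterator stopped exactly on a vector boundary (the last vector is the previous index at full length), otherwise bi_bvec_done is how many bytes of the current vector the bio actually covers. The sketch below models just that rule over a plain array of segment lengths; it is illustrative only (the names are invented, and real bvec iterators carry page/offset state as well).

    #include <stdio.h>

    /* Model: a "bio" is an array of segment lengths plus a total size. */
    struct last_seg { int idx; unsigned int len; };

    static struct last_seg find_last_seg(const unsigned int *seg_len, int nr_segs,
                                         unsigned int total)
    {
        unsigned int done = 0;   /* bytes consumed within the current segment */
        int idx = 0;

        /* advance by 'total' bytes, segment by segment */
        while (total) {
            unsigned int step = seg_len[idx] - done;

            if (step > total)
                step = total;
            done += step;
            total -= step;
            if (done == seg_len[idx] && total) {
                idx++;
                done = 0;
            }
        }

        /* analogous to the bi_bvec_done test in bio_get_last_bvec() */
        if (done == seg_len[idx])
            return (struct last_seg){ idx, seg_len[idx] };  /* ended on a boundary */
        return (struct last_seg){ idx, done };              /* ended mid-segment */
    }

    int main(void)
    {
        unsigned int segs[] = { 4096, 4096, 4096 };
        struct last_seg l = find_last_seg(segs, 3, 10240);  /* ends 2048 into segs[2] */

        printf("last segment %d, %u bytes\n", l.idx, l.len);
        return 0;
    }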
*/ - bio_clear_hipri(bio); - + bio_clear_polled(bio); return bio_split(bio, sectors, GFP_NOIO, bs); } /** * __blk_queue_split - split a bio and submit the second half + * @q: [in] request_queue new bio is being queued at * @bio: [in, out] bio to be split * @nr_segs: [out] number of segments in the first bio * @@ -302,9 +335,9 @@ split: * of the caller to ensure that q->bio_split is only released after processing * of the split bio has finished. */ -void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) +void __blk_queue_split(struct request_queue *q, struct bio **bio, + unsigned int *nr_segs) { - struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -321,21 +354,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) nr_segs); break; default: - /* - * All drivers must accept single-segments bios that are <= - * PAGE_SIZE. This is a quick and dirty check that relies on - * the fact that bi_io_vec[0] is always valid if a bio has data. - * The check might lead to occasional false negatives when bios - * are cloned, but compared to the performance impact of cloned - * bios themselves the loop below doesn't matter anyway. - */ - if (!q->limits.chunk_sectors && - (*bio)->bi_vcnt == 1 && - ((*bio)->bi_io_vec[0].bv_len + - (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { - *nr_segs = 1; - break; - } split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } @@ -365,9 +383,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) */ void blk_queue_split(struct bio **bio) { + struct request_queue *q = bdev_get_queue((*bio)->bi_bdev); unsigned int nr_segs; - __blk_queue_split(bio, &nr_segs); + if (blk_may_split(q, *bio)) + __blk_queue_split(q, bio, &nr_segs); } EXPORT_SYMBOL(blk_queue_split); @@ -558,6 +578,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) return queue_max_segments(rq->q); } +static inline unsigned int blk_rq_get_max_sectors(struct request *rq, + sector_t offset) +{ + struct request_queue *q = rq->q; + + if (blk_rq_is_passthrough(rq)) + return q->limits.max_hw_sectors; + + if (!q->limits.chunk_sectors || + req_op(rq) == REQ_OP_DISCARD || + req_op(rq) == REQ_OP_SECURE_ERASE) + return blk_queue_get_max_sectors(q, req_op(rq)); + + return min(blk_max_size_offset(q, offset, 0), + blk_queue_get_max_sectors(q, req_op(rq))); +} + static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { @@ -718,6 +755,13 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) +{ + if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b)) + return true; + return false; +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. 
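blk_rq_get_max_sectors(), added earlier in this hunk, caps a request at whichever is smaller: the per-operation limit or, when chunk_sectors is set (zoned or striped devices), the room left before the next chunk boundary as computed by blk_max_size_offset(). A hedged user-space sketch of that boundary arithmetic follows; it assumes a power-of-two chunk size (the kernel helper also handles the non-power-of-two case) and the function name is made up.

    #include <stdio.h>

    /*
     * Sectors that may be issued at 'offset' without crossing a chunk
     * boundary, assuming chunk_sectors is a power of two (simplified model
     * of blk_max_size_offset()).
     */
    static unsigned int max_sectors_at(unsigned long long offset,
                                       unsigned int chunk_sectors,
                                       unsigned int max_hw_sectors)
    {
        unsigned int to_boundary;

        if (!chunk_sectors)
            return max_hw_sectors;

        to_boundary = chunk_sectors - (offset & (chunk_sectors - 1));
        return to_boundary < max_hw_sectors ? to_boundary : max_hw_sectors;
    }

    int main(void)
    {
        /* 256 KiB chunks (512 sectors), request starting 100 sectors in */
        printf("%u sectors allowed\n", max_sectors_at(100, 512, 1024)); /* 412 */
        return 0;
    }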
@@ -1023,12 +1067,11 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) + * @same_queue_rq: output value, will be true if there's an existing request + * from the passed in @q already in the plug list * - * Determine whether @bio being queued on @q can be merged with a request - * on %current's plugged list. Returns %true if merge was successful, + * Determine whether @bio being queued on @q can be merged with the previous + * request on %current's plugged list. Returns %true if merge was successful, * otherwise %false. * * Plugging coalesces IOs from the same issuer for the same purpose without @@ -1041,36 +1084,26 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) + unsigned int nr_segs, bool *same_queue_rq) { struct blk_plug *plug; struct request *rq; - struct list_head *plug_list; plug = blk_mq_plug(q, bio); - if (!plug) + if (!plug || rq_list_empty(plug->mq_list)) return false; - plug_list = &plug->mq_list; - - list_for_each_entry_reverse(rq, plug_list, queuelist) { - if (rq->q == q && same_queue_rq) { - /* - * Only blk-mq multiple hardware queues case checks the - * rq in the same queue, there should be only one such - * rq in a queue - **/ - *same_queue_rq = rq; - } - - if (rq->q != q) - continue; - - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; + /* check the previously added entry for a quick merge attempt */ + rq = rq_list_peek(&plug->mq_list); + if (rq->q == q) { + /* + * Only blk-mq multiple hardware queues case checks the rq in + * the same queue, there should be only one such rq in a queue + */ + *same_queue_rq = true; } - + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) + return true; return false; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4b66d2776eda..f5076c173477 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -124,11 +124,11 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), - QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(NOWAIT), }; #undef QUEUE_FLAG_NAME @@ -286,7 +286,7 @@ static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOUNMAP), - CMD_FLAG_NAME(HIPRI), + CMD_FLAG_NAME(POLLED), }; #undef CMD_FLAG_NAME @@ -452,11 +452,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, atomic_read(&tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); - sbitmap_queue_show(tags->bitmap_tags, m); + sbitmap_queue_show(&tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); - sbitmap_queue_show(tags->breserved_tags, m); + sbitmap_queue_show(&tags->breserved_tags, m); } } @@ -487,7 +487,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->tags) - 
sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: @@ -521,77 +521,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->sched_tags) - sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: return res; } -static int hctx_io_poll_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "considered=%lu\n", hctx->poll_considered); - seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); - seq_printf(m, "success=%lu\n", hctx->poll_success); - return 0; -} - -static ssize_t hctx_io_poll_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; - return count; -} - -static int hctx_dispatched_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - int i; - - seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { - unsigned int d = 1U << (i - 1); - - seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); - } - - seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); - return 0; -} - -static ssize_t hctx_dispatched_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - int i; - - for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) - hctx->dispatched[i] = 0; - return count; -} - -static int hctx_queued_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->queued); - return 0; -} - -static ssize_t hctx_queued_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->queued = 0; - return count; -} - static int hctx_run_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -613,7 +549,7 @@ static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } @@ -662,57 +598,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); -static int ctx_dispatched_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); - return 0; -} - -static ssize_t ctx_dispatched_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; - return count; -} - -static int ctx_merged_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu\n", ctx->rq_merged); - return 0; -} - -static ssize_t ctx_merged_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_merged = 0; - return count; -} - -static int ctx_completed_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); - return 0; -} - -static ssize_t ctx_completed_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - 
ctx->rq_completed[0] = ctx->rq_completed[1] = 0; - return count; -} - static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; @@ -788,9 +673,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write}, - {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write}, - {"queued", 0600, hctx_queued_show, hctx_queued_write}, {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, @@ -802,9 +684,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, - {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, - {"merged", 0600, ctx_merged_show, ctx_merged_write}, - {"completed", 0600, ctx_completed_show, ctx_completed_write}, {}, }; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0f006cabfd91..c62b966dfaba 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -57,10 +57,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* @@ -363,7 +361,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; @@ -389,13 +387,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, * potentially merge with. Currently includes a hand-wavy stop * count of 8, to not spend too much time checking for merges. 
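A few hunks above, blk_mq_sched_restart() drops its internal test_bit() and becomes __blk_mq_sched_restart(); the test moves into an inline wrapper (visible in the blk-mq-sched.h hunk further down), so the common no-restart case never leaves the caller. A generic sketch of that "test inline, act out of line" shape, using C11 atomics rather than kernel bitops; all names below are invented for illustration.

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint state;
    #define SCHED_RESTART_BIT 1u

    static void do_restart_slowpath(void)
    {
        /* clear the flag, then re-run the queue (the expensive part) */
        atomic_fetch_and(&state, ~SCHED_RESTART_BIT);
        puts("re-running hardware queue");
    }

    /* cheap inline check on every completion; slow path only when flagged */
    static inline void maybe_restart(void)
    {
        if (atomic_load_explicit(&state, memory_order_relaxed) & SCHED_RESTART_BIT)
            do_restart_slowpath();
    }

    int main(void)
    {
        maybe_restart();                                /* bit clear: no work */
        atomic_fetch_or(&state, SCHED_RESTART_BIT);     /* dispatch ran out of tags */
        maybe_restart();                                /* bit set: slow path runs */
        return 0;
    }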
*/ - if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { - ctx->rq_merged++; + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) ret = true; - } spin_unlock(&ctx->lock); - return ret; } @@ -515,83 +510,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static int blk_mq_sched_alloc_tags(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) +static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) { - struct blk_mq_tag_set *set = q->tag_set; - int ret; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + hctx->sched_tags = q->sched_shared_tags; + return 0; + } + + hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, + q->nr_requests); - hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; + return 0; +} - ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) { - blk_mq_free_rq_map(hctx->sched_tags, set->flags); - hctx->sched_tags = NULL; - } - - return ret; +static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) +{ + blk_mq_free_rq_map(queue->sched_shared_tags); + queue->sched_shared_tags = NULL; } /* called in queue's release handler, tagset has gone away */ -static void blk_mq_sched_tags_teardown(struct request_queue *q) +static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); + if (!blk_mq_is_shared_tags(flags)) + blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; } } + + if (blk_mq_is_shared_tags(flags)) + blk_mq_exit_sched_shared_tags(q); } -static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +static int blk_mq_init_sched_shared_tags(struct request_queue *queue) { struct blk_mq_tag_set *set = queue->tag_set; - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - struct blk_mq_hw_ctx *hctx; - int ret, i; /* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. */ - ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, - &queue->sched_breserved_tags, - MAX_SCHED_RQ, set->reserved_tags, - set->numa_node, alloc_policy); - if (ret) - return ret; - - queue_for_each_hw_ctx(queue, hctx, i) { - hctx->sched_tags->bitmap_tags = - &queue->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &queue->sched_breserved_tags; - } + queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!queue->sched_shared_tags) + return -ENOMEM; - sbitmap_queue_resize(&queue->sched_bitmap_tags, - queue->nr_requests - set->reserved_tags); + blk_mq_tag_update_sched_shared_tags(queue); return 0; } -static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) -{ - sbitmap_queue_free(&queue->sched_bitmap_tags); - sbitmap_queue_free(&queue->sched_breserved_tags); -} - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { + unsigned int i, flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; - unsigned int i; int ret; if (!e) { @@ -606,23 +589,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) * Additionally, this is a per-hw queue depth. 
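The scheduler depth assigned just after this comment is twice the smaller of the hardware queue depth and BLKDEV_DEFAULT_RQ, applied per hardware queue. A quick worked sketch of that arithmetic, assuming BLKDEV_DEFAULT_RQ is 128 (its definition is not shown in this hunk); the helper name is made up.

    #include <stdio.h>

    #define BLKDEV_DEFAULT_RQ 128   /* assumed value; see include/linux/blkdev.h */

    static unsigned int default_sched_depth(unsigned int hw_queue_depth)
    {
        unsigned int base = hw_queue_depth < BLKDEV_DEFAULT_RQ ?
                            hw_queue_depth : BLKDEV_DEFAULT_RQ;
        return 2 * base;
    }

    int main(void)
    {
        printf("%u\n", default_sched_depth(1024));  /* deep NVMe-style queue -> 256 */
        printf("%u\n", default_sched_depth(32));    /* shallow SATA NCQ queue -> 64 */
        return 0;
    }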
*/ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_MAX_RQ); + BLKDEV_DEFAULT_RQ); - queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_tags(q, hctx, i); + if (blk_mq_is_shared_tags(flags)) { + ret = blk_mq_init_sched_shared_tags(q); if (ret) - goto err_free_tags; + return ret; } - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { - ret = blk_mq_init_sched_shared_sbitmap(q); + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (ret) - goto err_free_tags; + goto err_free_map_and_rqs; } ret = e->ops.init_sched(q, e); if (ret) - goto err_free_sbitmap; + goto err_free_map_and_rqs; blk_mq_debugfs_register_sched(q); @@ -631,7 +614,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -642,12 +625,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; -err_free_sbitmap: - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) - blk_mq_exit_sched_shared_sbitmap(q); -err_free_tags: - blk_mq_sched_free_requests(q); - blk_mq_sched_tags_teardown(q); +err_free_map_and_rqs: + blk_mq_sched_free_rqs(q); + blk_mq_sched_tags_teardown(q, flags); + q->elevator = NULL; return ret; } @@ -656,14 +637,20 @@ err_free_tags: * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ -void blk_mq_sched_free_requests(struct request_queue *q) +void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) - blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, + BLK_MQ_NO_HCTX_IDX); + } else { + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, + hctx->sched_tags, i); + } } } @@ -684,8 +671,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); - blk_mq_sched_tags_teardown(q); - if (blk_mq_is_sbitmap_shared(flags)) - blk_mq_exit_sched_shared_sbitmap(q); + blk_mq_sched_tags_teardown(q, flags); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5246ae040704..25d1034952b6 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -2,21 +2,22 @@ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H +#include "elevator.h" #include "blk-mq.h" #include "blk-mq-tag.h" -#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) +#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); +void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async); @@ -28,45 +29,51 @@ void 
blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); -void blk_mq_sched_free_requests(struct request_queue *q); +void blk_mq_sched_free_rqs(struct request_queue *q); -static inline bool -blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs) +static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { - if (blk_queue_nomerges(q) || !bio_mergeable(bio)) - return false; + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + __blk_mq_sched_restart(hctx); +} - return __blk_mq_sched_bio_merge(q, bio, nr_segs); +static inline bool bio_mergeable(struct bio *bio) +{ + return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.allow_merge) - return e->type->ops.allow_merge(q, rq, bio); + if (rq->rq_flags & RQF_ELV) { + struct elevator_queue *e = q->elevator; + if (e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); + } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { - struct elevator_queue *e = rq->q->elevator; + if (rq->rq_flags & RQF_ELV) { + struct elevator_queue *e = rq->q->elevator; - if (e && e->type->ops.completed_request) - e->type->ops.completed_request(rq, now); + if (e->type->ops.completed_request) + e->type->ops.completed_request(rq, now); + } } static inline void blk_mq_sched_requeue_request(struct request *rq) { - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; + if (rq->rq_flags & RQF_ELV) { + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; - if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) - e->type->ops.requeue_request(rq); + if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request) + e->type->ops.requeue_request(rq); + } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ff5caeb82542..995336abee33 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -24,13 +24,12 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&set->active_queues_shared_sbitmap); + atomic_inc(&hctx->tags->active_queues); } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) @@ -45,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - sbitmap_queue_wake_all(tags->bitmap_tags); + sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) - sbitmap_queue_wake_all(tags->breserved_tags); + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -57,20 +56,20 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; - struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; - if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if 
(blk_mq_is_shared_tags(hctx->flags)) { + struct request_queue *q = hctx->queue; + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; - atomic_dec(&set->active_queues_shared_sbitmap); } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; - atomic_dec(&tags->active_queues); } + atomic_dec(&tags->active_queues); + blk_mq_tag_wakeup_all(tags, false); } @@ -87,6 +86,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, return __sbitmap_queue_get(bt); } +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt = &tags->bitmap_tags; + unsigned long ret; + + if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || + data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + return 0; + ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); + *offset += tags->nr_reserved_tags; + return ret; +} + unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); @@ -101,10 +115,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } - bt = tags->breserved_tags; + bt = &tags->breserved_tags; tag_offset = 0; } else { - bt = tags->bitmap_tags; + bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } @@ -150,9 +164,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) - bt = tags->breserved_tags; + bt = &tags->breserved_tags; else - bt = tags->bitmap_tags; + bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on @@ -186,13 +200,19 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) +{ + sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, + tag_array, nr_tags); +} + struct bt_iter_data { struct blk_mq_hw_ctx *hctx; busy_iter_fn *fn; @@ -340,9 +360,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) - bt_tags_for_each(tags, tags->breserved_tags, fn, priv, + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); - bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** @@ -379,9 +399,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { - int i; + unsigned int flags = tagset->flags; + int i, nr_tags; + + nr_tags = blk_mq_is_shared_tags(flags) ? 
1 : tagset->nr_hw_queues; - for (i = 0; i < tagset->nr_hw_queues; i++) { + for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); @@ -459,8 +482,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } blk_queue_exit(q); } @@ -492,56 +515,10 @@ free_bitmap_tags: return -ENOMEM; } -static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) -{ - int ret; - - ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, - &tags->__breserved_tags, - tags->nr_tags, tags->nr_reserved_tags, - node, alloc_policy); - if (ret) - return ret; - - tags->bitmap_tags = &tags->__bitmap_tags; - tags->breserved_tags = &tags->__breserved_tags; - - return 0; -} - -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) -{ - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - int i, ret; - - ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, - set->queue_depth, set->reserved_tags, - set->numa_node, alloc_policy); - if (ret) - return ret; - - for (i = 0; i < set->nr_hw_queues; i++) { - struct blk_mq_tags *tags = set->tags[i]; - - tags->bitmap_tags = &set->__bitmap_tags; - tags->breserved_tags = &set->__breserved_tags; - } - - return 0; -} - -void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) -{ - sbitmap_queue_free(&set->__bitmap_tags); - sbitmap_queue_free(&set->__breserved_tags); -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, - int node, unsigned int flags) + int node, int alloc_policy) { - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -557,22 +534,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); - if (blk_mq_is_sbitmap_shared(flags)) - return tags; - - if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { + if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, + total_tags, reserved_tags, node, + alloc_policy) < 0) { kfree(tags); return NULL; } return tags; } -void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_tags(struct blk_mq_tags *tags) { - if (!blk_mq_is_sbitmap_shared(flags)) { - sbitmap_queue_free(tags->bitmap_tags); - sbitmap_queue_free(tags->breserved_tags); - } + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } @@ -592,7 +566,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; - bool ret; if (!can_grow) return -EINVAL; @@ -604,34 +577,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > MAX_SCHED_RQ) return -EINVAL; - new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, set->flags); + /* + * Only the sbitmap needs resizing since we allocated the max + * initially. 
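blk_mq_get_tags(), added above, returns a word-sized bitmask of freshly allocated tags plus the offset that bit 0 corresponds to; the caller (__blk_mq_alloc_requests_batch(), later in this diff) walks the set bits to turn them into tag numbers. A small stand-alone illustration of that decoding; the function name here is invented.

    #include <stdio.h>

    /* Turn a (mask, offset) pair from a batched allocation into tag numbers. */
    static void decode_tag_batch(unsigned long tag_mask, unsigned int tag_offset)
    {
        for (unsigned int i = 0; tag_mask; i++) {
            if (!(tag_mask & (1UL << i)))
                continue;
            tag_mask &= ~(1UL << i);
            printf("allocated tag %u\n", tag_offset + i);
        }
    }

    int main(void)
    {
        /* e.g. bits 0, 1 and 5 granted, with 4 reserved tags below them */
        decode_tag_batch(0x23UL, 4);    /* prints tags 4, 5 and 9 */
        return 0;
    }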
+ */ + if (blk_mq_is_shared_tags(set->flags)) + return 0; + + new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; - ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); - if (ret) { - blk_mq_free_rq_map(new, set->flags); - return -ENOMEM; - } - blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, set->flags); + blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ - sbitmap_queue_resize(tags->bitmap_tags, + sbitmap_queue_resize(&tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } -void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) +void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) +{ + struct blk_mq_tags *tags = set->shared_tags; + + sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); +} + +void blk_mq_tag_update_sched_shared_tags(struct request_queue *q) { - sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); + sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, + q->nr_requests - q->tag_set->reserved_tags); } /** diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 8ed55af08427..df787b5a23bd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -2,52 +2,30 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -/* - * Tag address space map. - */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - - atomic_t active_queues; - - struct sbitmap_queue *bitmap_tags; - struct sbitmap_queue *breserved_tags; - - struct sbitmap_queue __bitmap_tags; - struct sbitmap_queue __breserved_tags; - - struct request **rqs; - struct request **static_rqs; - struct list_head page_list; - - /* - * used to clear request reference in rqs[] before freeing one - * request pool - */ - spinlock_t lock; -}; +struct blk_mq_alloc_data; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, - int node, unsigned int flags); -extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); + int node, int alloc_policy); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy); -extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); -extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); -extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, +extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); +extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/block/blk-mq.c b/block/blk-mq.c index 108a352051be..07eb1412760b 100644 --- 
a/block/blk-mq.c +++ b/block/blk-mq.c @@ -10,14 +10,15 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> +#include <linux/interrupt.h> #include <linux/llist.h> -#include <linux/list_sort.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/sysctl.h> @@ -63,6 +64,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) return bucket; } +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) + +static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, + blk_qc_t qc) +{ + return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; +} + +static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, + blk_qc_t qc) +{ + unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); + + if (qc & BLK_QC_T_INTERNAL) + return blk_mq_tag_to_rq(hctx->sched_tags, tag); + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static inline blk_qc_t blk_rq_to_qc(struct request *rq) +{ + return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | + (rq->tag != -1 ? + rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); +} + /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. @@ -188,9 +215,11 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void blk_mq_unfreeze_queue(struct request_queue *q) +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { mutex_lock(&q->mq_freeze_lock); + if (force_atomic) + q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { @@ -199,6 +228,11 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } mutex_unlock(&q->mq_freeze_lock); } + +void blk_mq_unfreeze_queue(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* @@ -207,7 +241,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + if (!q->quiesce_depth++) + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); @@ -248,10 +287,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + bool run_queue = false; + + spin_lock_irqsave(&q->queue_lock, flags); + if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { + ; + } else if (!--q->quiesce_depth) { + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + run_queue = true; + } + spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ - blk_mq_run_hw_queues(q, true); + if (run_queue) + blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); @@ -265,74 +315,67 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } -/* - * Only need start/end time stamping if we have iostat or - * blk stats enabled, or using an IO scheduler. 
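The three helpers introduced above pack the polling cookie as "hardware queue number in the high bits, tag in the low 16 bits", with bit 31 marking a scheduler (internal) tag. A self-contained round-trip of that encoding; it is a user-space model only (the kernel versions resolve the hctx and request pointers rather than printing integers), and the constant and function names are invented.

    #include <stdbool.h>
    #include <stdio.h>

    #define QC_T_SHIFT      16
    #define QC_T_INTERNAL   (1U << 31)

    static unsigned int qc_encode(unsigned int hw_queue, unsigned int tag, bool internal)
    {
        return (hw_queue << QC_T_SHIFT) | tag | (internal ? QC_T_INTERNAL : 0);
    }

    static void qc_decode(unsigned int qc)
    {
        unsigned int hw_queue = (qc & ~QC_T_INTERNAL) >> QC_T_SHIFT;
        unsigned int tag = qc & ((1U << QC_T_SHIFT) - 1);

        printf("hctx %u, %s tag %u\n", hw_queue,
               (qc & QC_T_INTERNAL) ? "internal" : "driver", tag);
    }

    int main(void)
    {
        qc_decode(qc_encode(3, 42, false));  /* hctx 3, driver tag 42 */
        qc_decode(qc_encode(7, 9, true));    /* hctx 7, internal tag 9 */
        return 0;
    }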
- */ -static inline bool blk_mq_need_time_stamp(struct request *rq) -{ - return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; -} - static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, u64 alloc_time_ns) + struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) { - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct blk_mq_ctx *ctx = data->ctx; + struct blk_mq_hw_ctx *hctx = data->hctx; + struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; - if (data->q->elevator) { - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = tag; - } else { + rq->q = q; + rq->mq_ctx = ctx; + rq->mq_hctx = hctx; + rq->cmd_flags = data->cmd_flags; + + if (data->flags & BLK_MQ_REQ_PM) + data->rq_flags |= RQF_PM; + if (blk_queue_io_stat(q)) + data->rq_flags |= RQF_IO_STAT; + rq->rq_flags = data->rq_flags; + + if (!(data->rq_flags & RQF_ELV)) { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; + } else { + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = tag; } + rq->timeout = 0; - /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = data->q; - rq->mq_ctx = data->ctx; - rq->mq_hctx = data->hctx; - rq->rq_flags = 0; - rq->cmd_flags = data->cmd_flags; - if (data->flags & BLK_MQ_REQ_PM) - rq->rq_flags |= RQF_PM; - if (blk_queue_io_stat(data->q)) - rq->rq_flags |= RQF_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; #endif - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); - else - rq->start_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif - blk_crypto_rq_set_defaults(rq); - /* tag was already set */ - WRITE_ONCE(rq->deadline, 0); - - rq->timeout = 0; - rq->end_io = NULL; rq->end_io_data = NULL; - data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; + blk_crypto_rq_set_defaults(rq); + INIT_LIST_HEAD(&rq->queuelist); + /* tag was already set */ + WRITE_ONCE(rq->deadline, 0); refcount_set(&rq->ref, 1); - if (!op_is_flush(data->cmd_flags)) { + if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; rq->elv.icq = NULL; - if (e && e->type->ops.prepare_request) { + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + + if (!op_is_flush(data->cmd_flags) && + e->type->ops.prepare_request) { if (e->type->icq_cache) blk_mq_sched_assign_ioc(rq); @@ -341,15 +384,44 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, } } - data->hctx->queued++; return rq; } -static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) +static inline struct request * +__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, + u64 alloc_time_ns) +{ + unsigned int tag, tag_offset; + struct blk_mq_tags *tags; + struct request *rq; + unsigned long tag_mask; + int i, nr = 0; + + tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); + if (unlikely(!tag_mask)) + return NULL; + + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + prefetch(tags->static_rqs[tag]); + tag = tag_offset + i; + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); + rq_list_add(data->cached_rq, rq); + } + 
data->nr_tags -= nr; + + return rq_list_pop(data->cached_rq); +} + +static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; + struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ @@ -379,6 +451,16 @@ retry: blk_mq_tag_busy(data->hctx); /* + * Try batched alloc if we want more than 1 tag. + */ + if (data->nr_tags > 1) { + rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); + if (rq) + return rq; + data->nr_tags = 1; + } + + /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. @@ -387,16 +469,18 @@ retry: if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; - /* - * Give up the CPU and sleep for a random short time to ensure - * that thread using a realtime scheduling class are migrated - * off the CPU, and thus off the hctx that is going away. + * Give up the CPU and sleep for a random short time to + * ensure that thread using a realtime scheduling class + * are migrated off the CPU, and thus off the hctx that + * is going away. */ msleep(3); goto retry; } - return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); + + return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, + alloc_time_ns); } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, @@ -406,6 +490,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, .q = q, .flags = flags, .cmd_flags = op, + .rq_flags = q->elevator ? RQF_ELV : 0, + .nr_tags = 1, }; struct request *rq; int ret; @@ -414,7 +500,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = __blk_mq_alloc_request(&data); + rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; rq->__data_len = 0; @@ -434,6 +520,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .q = q, .flags = flags, .cmd_flags = op, + .rq_flags = q->elevator ? 
RQF_ELV : 0, + .nr_tags = 1, }; u64 alloc_time_ns = 0; unsigned int cpu; @@ -478,7 +566,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, + alloc_time_ns); out_queue_exit: blk_queue_exit(q); @@ -507,12 +596,12 @@ static void __blk_mq_free_request(struct request *rq) void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.finish_request) + struct elevator_queue *e = q->elevator; + + if (e->type->ops.finish_request) e->type->ops.finish_request(rq); if (rq->elv.icq) { put_io_context(rq->elv.icq->ioc); @@ -520,7 +609,6 @@ void blk_mq_free_request(struct request *rq) } } - ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); @@ -535,21 +623,173 @@ void blk_mq_free_request(struct request *rq) } EXPORT_SYMBOL_GPL(blk_mq_free_request); -inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +void blk_mq_free_plug_rqs(struct blk_plug *plug) { - u64 now = 0; + struct request *rq; - if (blk_mq_need_time_stamp(rq)) - now = ktime_get_ns(); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) { + percpu_ref_get(&rq->q->q_usage_counter); + blk_mq_free_request(rq); + } +} + +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, blk_status_t error) +{ + if (unlikely(error)) { + bio->bi_status = error; + } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + */ + if (bio->bi_iter.bi_size != nbytes) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + + bio_advance(bio, nbytes); + + if (unlikely(rq->rq_flags & RQF_QUIET)) + bio_set_flag(bio, BIO_QUIET); + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + bio_endio(bio); +} + +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + if (req->part && blk_do_io_stat(req)) { + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); + part_stat_unlock(); + } +} + +/** + * blk_update_request - Complete multiple bytes without completing the request + * @req: the request being processed + * @error: block status code + * @nr_bytes: number of bytes to complete for @req + * + * Description: + * Ends I/O on a number of bytes attached to @req, but doesn't complete + * the request structure even if @req doesn't have leftover. + * If @req has leftover, sets it up for the next range of segments. + * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. + * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. 
+ * + * Return: + * %false - this request doesn't have any more data + * %true - this request has more data + **/ +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) +{ + int total_bytes; + + trace_block_rq_complete(req, error, nr_bytes); + + if (!req->bio) + return false; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET))) + blk_print_req_error(req, error); + + blk_account_io_completion(req, nr_bytes); + + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); + + if (bio_bytes == bio->bi_iter.bi_size) + req->bio = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + req_bio_endio(req, bio, bio_bytes, error); + + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; + + if (!nr_bytes) + break; + } + + /* + * completely done + */ + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->__data_len = 0; + return false; + } + + req->__data_len -= total_bytes; + + /* update sector only for requests with clear definition of sector */ + if (!blk_rq_is_passthrough(req)) + req->__sector += total_bytes >> 9; + + /* mixed attributes always follow the first bio */ + if (req->rq_flags & RQF_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; + } + + if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. 
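blk_update_request(), whose body appears above, retires bios from the front of the request until nr_bytes is consumed; a bio that is only partially covered stays attached with its iterator advanced, and the function returns true when data is still outstanding. A simplified user-space model of that accounting, tracking byte counts only (the real code also handles integrity, zone append and mixed-merge flags); the struct and function names are made up.

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_req {
        unsigned int bio_bytes[8];   /* remaining bytes per attached "bio" */
        int nr_bios;
        int first;                   /* index of the first unfinished bio */
    };

    /* Complete nr_bytes of the request; return true if data is still pending. */
    static bool update_request(struct fake_req *req, unsigned int nr_bytes)
    {
        while (req->first < req->nr_bios && nr_bytes) {
            unsigned int *bio = &req->bio_bytes[req->first];
            unsigned int step = *bio < nr_bytes ? *bio : nr_bytes;

            *bio -= step;
            nr_bytes -= step;
            if (*bio == 0)
                req->first++;        /* this bio is fully ended */
        }
        return req->first < req->nr_bios;   /* leftover -> caller continues later */
    }

    int main(void)
    {
        struct fake_req req = { .bio_bytes = { 4096, 4096, 4096 }, .nr_bios = 3 };

        printf("more data: %d\n", update_request(&req, 6144)); /* 1: mid second bio */
        printf("more data: %d\n", update_request(&req, 6144)); /* 0: request done */
        return 0;
    }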
+ */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + blk_dump_rq_flags(req, "request botched"); + req->__data_len = blk_rq_cur_bytes(req); + } + /* recalculate the number of segments */ + req->nr_phys_segments = blk_recalc_rq_segments(req); + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_update_request); + +static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) +{ if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq, now); } blk_mq_sched_completed_request(rq, now); - blk_account_io_done(rq, now); +} + +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +{ + if (blk_mq_need_time_stamp(rq)) + __blk_mq_end_request_acct(rq, ktime_get_ns()); if (rq->end_io) { rq_qos_done(rq->q, rq); @@ -568,6 +808,57 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); +#define TAG_COMP_BATCH 32 + +static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, + int *tag_array, int nr_tags) +{ + struct request_queue *q = hctx->queue; + + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); + percpu_ref_put_many(&q->q_usage_counter, nr_tags); +} + +void blk_mq_end_request_batch(struct io_comp_batch *iob) +{ + int tags[TAG_COMP_BATCH], nr_tags = 0; + struct blk_mq_hw_ctx *cur_hctx = NULL; + struct request *rq; + u64 now = 0; + + if (iob->need_ts) + now = ktime_get_ns(); + + while ((rq = rq_list_pop(&iob->req_list)) != NULL) { + prefetch(rq->bio); + prefetch(rq->rq_next); + + blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq)); + if (iob->need_ts) + __blk_mq_end_request_acct(rq, now); + + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + if (!refcount_dec_and_test(&rq->ref)) + continue; + + blk_crypto_free_request(rq); + blk_pm_mark_last_busy(rq); + rq_qos_done(rq->q, rq); + + if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { + if (cur_hctx) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); + nr_tags = 0; + cur_hctx = rq->mq_hctx; + } + tags[nr_tags++] = rq->tag; + } + + if (nr_tags) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); +} +EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); + static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); @@ -651,7 +942,7 @@ bool blk_mq_complete_request_remote(struct request *rq) * For a polled request, always complete locallly, it's pointless * to redirect the completion. 
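blk_mq_end_request_batch(), added above, frees driver tags in bursts: tags are gathered into a small on-stack array and flushed either when the array fills (TAG_COMP_BATCH) or when the next request belongs to a different hardware queue, so each flush touches a tag map only once. A generic sketch of that batching discipline; the names are invented and BATCH is shrunk to 4 so the example is easy to trace.

    #include <stdio.h>

    #define BATCH 4     /* stands in for TAG_COMP_BATCH (32 in the kernel) */

    static void flush_batch(int hctx, const int *tags, int nr)
    {
        printf("hctx %d: freeing %d tags starting with %d\n", hctx, nr, tags[0]);
    }

    static void complete_batch(const int *req_hctx, const int *req_tag, int nr_reqs)
    {
        int tags[BATCH], nr_tags = 0, cur_hctx = -1;

        for (int i = 0; i < nr_reqs; i++) {
            if (nr_tags == BATCH || (cur_hctx != -1 && cur_hctx != req_hctx[i])) {
                flush_batch(cur_hctx, tags, nr_tags);
                nr_tags = 0;
            }
            cur_hctx = req_hctx[i];
            tags[nr_tags++] = req_tag[i];
        }
        if (nr_tags)
            flush_batch(cur_hctx, tags, nr_tags);
    }

    int main(void)
    {
        int hctx[] = { 0, 0, 0, 1, 1, 1, 1, 1 };
        int tag[]  = { 3, 4, 9, 1, 2, 5, 6, 7 };

        complete_batch(hctx, tag, 8);   /* 3 tags for hctx 0, then 4 + 1 for hctx 1 */
        return 0;
    }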
*/ - if (rq->cmd_flags & REQ_HIPRI) + if (rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { @@ -716,7 +1007,14 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - rq->io_start_time_ns = ktime_get_ns(); + u64 start_time; +#ifdef CONFIG_BLK_CGROUP + if (rq->bio) + start_time = bio_issue_time(&rq->bio->bi_issue); + else +#endif + start_time = ktime_get_ns(); + rq->io_start_time_ns = start_time; rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -731,6 +1029,8 @@ void blk_mq_start_request(struct request *rq) if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); #endif + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) + WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); } EXPORT_SYMBOL(blk_mq_start_request); @@ -756,7 +1056,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -837,17 +1136,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) -{ - if (tag < tags->nr_tags) { - prefetch(tags->rqs[tag]); - return tags->rqs[tag]; - } - - return NULL; -} -EXPORT_SYMBOL(blk_mq_tag_to_rq); - static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -1052,24 +1340,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static inline unsigned int queued_to_index(unsigned int queued) +static bool __blk_mq_alloc_driver_tag(struct request *rq) { - if (!queued) - return 0; - - return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); -} - -static bool __blk_mq_get_driver_tag(struct request *rq) -{ - struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = rq->mq_hctx->tags->breserved_tags; + bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) @@ -1084,11 +1364,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -bool blk_mq_get_driver_tag(struct request *rq) +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && @@ -1112,7 +1390,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, struct sbitmap_queue *sbq; list_del_init(&wait->entry); - sbq = hctx->tags->bitmap_tags; + sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); @@ -1130,7 +1408,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq = 
&hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; @@ -1318,6 +1596,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); + bool needs_resource = false; if (list_empty(list)) return false; @@ -1363,6 +1642,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, queued++; break; case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; @@ -1373,6 +1654,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); + needs_resource = true; break; default: errors++; @@ -1383,8 +1665,6 @@ out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); - hctx->dispatched[queued_to_index(queued)]++; - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ @@ -1399,7 +1679,6 @@ out: /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); - bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; if (nr_budgets) blk_mq_release_budgets(q, list); @@ -1440,14 +1719,16 @@ out: * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do - * similar if we couldn't get budget and SCHED_RESTART is set. + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE || - no_budget_avail)) + else if (needs_restart && needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -1887,54 +2168,106 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static int plug_rq_cmp(void *priv, const struct list_head *a, - const struct list_head *b) +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, + bool from_schedule) +{ + if (hctx->queue->mq_ops->commit_rqs) { + trace_block_unplug(hctx->queue, *queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } + *queued = 0; +} + +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) { - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; + + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; - if (rqa->mq_ctx != rqb->mq_ctx) - return rqa->mq_ctx > rqb->mq_ctx; - if (rqa->mq_hctx != rqb->mq_hctx) - return rqa->mq_hctx > rqb->mq_hctx; + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } - return blk_rq_pos(rqa) > blk_rq_pos(rqb); + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + 
blk_mq_request_bypass_insert(rq, false, last); + blk_mq_commit_rqs(hctx, &queued, from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } + + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + unsigned int depth; LIST_HEAD(list); - if (list_empty(&plug->mq_list)) + if (rq_list_empty(plug->mq_list)) return; - list_splice_init(&plug->mq_list, &list); - - if (plug->rq_count > 2 && plug->multiple_queues) - list_sort(NULL, &list, plug_rq_cmp); - plug->rq_count = 0; + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + blk_mq_plug_issue_direct(plug, from_schedule); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; do { - struct list_head rq_list; - struct request *rq, *head_rq = list_entry_rq(list.next); - struct list_head *pos = &head_rq->queuelist; /* skip first */ - struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; - struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; - unsigned int depth = 1; - - list_for_each_continue(pos, &list) { - rq = list_entry_rq(pos); - BUG_ON(!rq->q); - if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) - break; - depth++; + struct request *rq; + + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } - list_cut_before(&rq_list, &list, pos); - trace_block_unplug(head_rq->q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_schedule); - } while(!list_empty(&list)); + } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -1957,19 +2290,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool last) + struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; - blk_qc_t new_cookie; blk_status_t ret; - new_cookie = request_to_qc_t(hctx, rq); - /* * For OK queue, we are done. For error, caller may kill it. 
* Any other error (busy), just add it to our list as we @@ -1979,7 +2308,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); - *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: @@ -1988,7 +2316,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, break; default: blk_mq_update_dispatch_busy(hctx, false); - *cookie = BLK_QC_T_NONE; break; } @@ -1997,7 +2324,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie, bool bypass_insert, bool last) { struct request_queue *q = rq->q; @@ -2017,7 +2343,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - if (q->elevator && !bypass_insert) + if ((rq->rq_flags & RQF_ELV) && !bypass_insert) goto insert; budget_token = blk_mq_get_dispatch_budget(q); @@ -2031,7 +2357,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - return __blk_mq_issue_directly(hctx, rq, cookie, last); + return __blk_mq_issue_directly(hctx, rq, last); insert: if (bypass_insert) return BLK_STS_RESOURCE; @@ -2045,7 +2371,6 @@ insert: * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. - * @cookie: Request queue cookie. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so @@ -2053,7 +2378,7 @@ insert: * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) + struct request *rq) { blk_status_t ret; int srcu_idx; @@ -2062,7 +2387,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + ret = __blk_mq_try_issue_directly(hctx, rq, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) @@ -2075,11 +2400,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; int srcu_idx; - blk_qc_t unused_cookie; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); + ret = __blk_mq_try_issue_directly(hctx, rq, true, last); hctx_unlock(hctx, srcu_idx); return ret; @@ -2123,27 +2447,28 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { - list_add_tail(&rq->queuelist, &plug->mq_list); - plug->rq_count++; - if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { - struct request *tmp; + if (!plug->multiple_queues) { + struct request *nxt = rq_list_peek(&plug->mq_list); - tmp = list_first_entry(&plug->mq_list, struct request, - queuelist); - if (tmp->q != rq->q) + if (nxt && nxt->q != rq->q) plug->multiple_queues = true; } + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; } /* - * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug 
queue for multiple
  * queues. This is important for md arrays to benefit from merging
  * requests.
  */
 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
 {
         if (plug->multiple_queues)
-                return BLK_MAX_REQUEST_COUNT * 4;
+                return BLK_MAX_REQUEST_COUNT * 2;
         return BLK_MAX_REQUEST_COUNT;
 }
@@ -2159,57 +2484,63 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
  *
  * It will not queue the request if there is an error with the bio, or at the
  * request creation.
- *
- * Returns: Request queue cookie.
  */
-blk_qc_t blk_mq_submit_bio(struct bio *bio)
+void blk_mq_submit_bio(struct bio *bio)
 {
-        struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
         const int is_sync = op_is_sync(bio->bi_opf);
-        const int is_flush_fua = op_is_flush(bio->bi_opf);
-        struct blk_mq_alloc_data data = {
-                .q = q,
-        };
         struct request *rq;
         struct blk_plug *plug;
-        struct request *same_queue_rq = NULL;
-        unsigned int nr_segs;
-        blk_qc_t cookie;
+        bool same_queue_rq = false;
+        unsigned int nr_segs = 1;
         blk_status_t ret;
-        bool hipri;
 
         blk_queue_bounce(q, &bio);
-        __blk_queue_split(&bio, &nr_segs);
+        if (blk_may_split(q, bio))
+                __blk_queue_split(q, &bio, &nr_segs);
 
         if (!bio_integrity_prep(bio))
                 goto queue_exit;
 
-        if (!is_flush_fua && !blk_queue_nomerges(q) &&
-            blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
-                goto queue_exit;
-
-        if (blk_mq_sched_bio_merge(q, bio, nr_segs))
-                goto queue_exit;
+        if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
+                if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
+                        goto queue_exit;
+                if (blk_mq_sched_bio_merge(q, bio, nr_segs))
+                        goto queue_exit;
+        }
 
         rq_qos_throttle(q, bio);
 
-        hipri = bio->bi_opf & REQ_HIPRI;
-
-        data.cmd_flags = bio->bi_opf;
-        rq = __blk_mq_alloc_request(&data);
-        if (unlikely(!rq)) {
-                rq_qos_cleanup(q, bio);
-                if (bio->bi_opf & REQ_NOWAIT)
-                        bio_wouldblock_error(bio);
-                goto queue_exit;
+        plug = blk_mq_plug(q, bio);
+        if (plug && plug->cached_rq) {
+                rq = rq_list_pop(&plug->cached_rq);
+                INIT_LIST_HEAD(&rq->queuelist);
+        } else {
+                struct blk_mq_alloc_data data = {
+                        .q = q,
+                        .nr_tags = 1,
+                        .cmd_flags = bio->bi_opf,
+                        .rq_flags = q->elevator ? RQF_ELV : 0,
+                };
+
+                if (plug) {
+                        data.nr_tags = plug->nr_ios;
+                        plug->nr_ios = 1;
+                        data.cached_rq = &plug->cached_rq;
+                }
+                rq = __blk_mq_alloc_requests(&data);
+                if (unlikely(!rq)) {
+                        rq_qos_cleanup(q, bio);
+                        if (bio->bi_opf & REQ_NOWAIT)
+                                bio_wouldblock_error(bio);
+                        goto queue_exit;
+                }
         }
 
         trace_block_getrq(bio);
 
         rq_qos_track(q, rq, bio);
 
-        cookie = request_to_qc_t(data.hctx, rq);
-
         blk_mq_bio_to_request(rq, bio, nr_segs);
 
         ret = blk_crypto_init_request(rq);
@@ -2217,17 +2548,15 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                 bio->bi_status = ret;
                 bio_endio(bio);
                 blk_mq_free_request(rq);
-                return BLK_QC_T_NONE;
+                return;
         }
 
-        plug = blk_mq_plug(q, bio);
-        if (unlikely(is_flush_fua)) {
-                /* Bypass scheduler for flush requests */
-                blk_insert_flush(rq);
-                blk_mq_run_hw_queue(data.hctx, true);
-        } else if (plug && (q->nr_hw_queues == 1 ||
-                   blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
-                   q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
+        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+                return;
+
+        if (plug && (q->nr_hw_queues == 1 ||
+            blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
+            q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
                 /*
                  * Use plugging if we have a ->commit_rqs() hook as well, as
                  * we know the driver uses bd->last in a smart fashion.
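
As a rough aid for reading the plug-related hunks above and below: the plug list becomes a singly linked rq_list, the multi-queue plug depth drops from 4x to 2x BLK_MAX_REQUEST_COUNT, and the plug is flushed once it holds blk_plug_max_rq_count() requests or once the most recently queued request is at least BLK_PLUG_FLUSH_SIZE bytes. The stand-alone user-space sketch below models only that bookkeeping; it is illustrative, not kernel code, and the struct layouts, the bytes and queue_id fields and the plug_should_flush() helper are simplified stand-ins invented here rather than kernel interfaces.

/* Minimal user-space model of the plug bookkeeping shown in the hunks
 * around this point.  Compile with: cc -std=c99 -Wall plug_model.c
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define BLK_MAX_REQUEST_COUNT   32
#define BLK_PLUG_FLUSH_SIZE     (128 * 1024)

struct request {
        struct request *rq_next;   /* singly linked, like rq->rq_next */
        unsigned int bytes;        /* stand-in for blk_rq_bytes(rq) */
        int queue_id;              /* stand-in for rq->q */
};

struct blk_plug {
        struct request *mq_list;   /* head of the rq_list */
        unsigned int rq_count;
        bool multiple_queues;
};

/* push to the head of the singly linked list, like rq_list_add() */
static void rq_list_add(struct request **list, struct request *rq)
{
        rq->rq_next = *list;
        *list = rq;
}

static struct request *rq_list_peek(struct request **list)
{
        return *list;
}

/* multi-queue plugs (e.g. md arrays) get twice the depth for merging */
static unsigned int blk_plug_max_rq_count(const struct blk_plug *plug)
{
        if (plug->multiple_queues)
                return BLK_MAX_REQUEST_COUNT * 2;
        return BLK_MAX_REQUEST_COUNT;
}

/* flush when the plug is full or the last queued request is large */
static bool plug_should_flush(struct blk_plug *plug)
{
        struct request *last = rq_list_peek(&plug->mq_list);

        if (plug->rq_count >= blk_plug_max_rq_count(plug))
                return true;
        return last && last->bytes >= BLK_PLUG_FLUSH_SIZE;
}

static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
        struct request *nxt = rq_list_peek(&plug->mq_list);

        if (!plug->multiple_queues && nxt && nxt->queue_id != rq->queue_id)
                plug->multiple_queues = true;
        rq->rq_next = NULL;
        rq_list_add(&plug->mq_list, rq);
        plug->rq_count++;
}

int main(void)
{
        struct blk_plug plug = { 0 };

        for (int i = 0; i < 40; i++) {
                struct request *rq = calloc(1, sizeof(*rq));

                rq->bytes = 4096;
                rq->queue_id = 0;
                if (plug_should_flush(&plug)) {
                        struct request *r;

                        printf("flush after %u plugged requests\n",
                               plug.rq_count);
                        while ((r = plug.mq_list) != NULL) {
                                plug.mq_list = r->rq_next;
                                free(r);   /* "dispatch" is just free here */
                        }
                        plug.rq_count = 0;
                }
                blk_add_rq_to_plug(&plug, rq);
        }
        printf("%u requests left plugged\n", plug.rq_count);
        return 0;
}

Because new requests are pushed to the head, rq_list_peek() returns the most recently queued request in constant time, which is all the flush-size check in blk_mq_submit_bio() below and the same-queue check in blk_add_rq_to_plug() above need.
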
@@ -2238,22 +2567,26 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) unsigned int request_count = plug->rq_count; struct request *last = NULL; - if (!request_count) + if (!request_count) { trace_block_plug(q); - else - last = list_entry_rq(plug->mq_list.prev); + } else if (!blk_queue_nomerges(q)) { + last = rq_list_peek(&plug->mq_list); + if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE) + last = NULL; + } - if (request_count >= blk_plug_max_rq_count(plug) || (last && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_flush_plug_list(plug, false); + if (request_count >= blk_plug_max_rq_count(plug) || last) { + blk_mq_flush_plug_list(plug, false); trace_block_plug(q); } blk_add_rq_to_plug(plug, rq); - } else if (q->elevator) { + } else if (rq->rq_flags & RQF_ELV) { /* Insert the request at the IO scheduler queue */ blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { + struct request *next_rq = NULL; + /* * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be @@ -2261,39 +2594,32 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) * The plug list might get flushed before this. If that happens, * the plug list is empty, and same_queue_rq is invalid. */ - if (list_empty(&plug->mq_list)) - same_queue_rq = NULL; if (same_queue_rq) { - list_del_init(&same_queue_rq->queuelist); + next_rq = rq_list_pop(&plug->mq_list); plug->rq_count--; } blk_add_rq_to_plug(plug, rq); trace_block_plug(q); - if (same_queue_rq) { - data.hctx = same_queue_rq->mq_hctx; + if (next_rq) { trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); + blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq); } } else if ((q->nr_hw_queues > 1 && is_sync) || - !data.hctx->dispatch_busy) { + !rq->mq_hctx->dispatch_busy) { /* * There is no scheduler and we can try to send directly * to the hardware. */ - blk_mq_try_issue_directly(data.hctx, rq, &cookie); + blk_mq_try_issue_directly(rq->mq_hctx, rq); } else { /* Default case. 
*/ blk_mq_sched_insert_request(rq, false, true, true); } - if (!hipri) - return BLK_QC_T_NONE; - return cookie; + return; queue_exit: blk_queue_exit(q); - return BLK_QC_T_NONE; } static size_t order_to_size(unsigned int order) @@ -2302,19 +2628,22 @@ static size_t order_to_size(unsigned int order) } /* called before freeing request pool in @tags */ -static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, + struct blk_mq_tags *tags) { - struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; struct page *page; unsigned long flags; + /* There is no need to clear a driver tags own mapping */ + if (drv_tags == tags) + return; + list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; - for (i = 0; i < set->queue_depth; i++) { + for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; @@ -2338,9 +2667,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { + struct blk_mq_tags *drv_tags; struct page *page; - if (tags->rqs && set->ops->exit_request) { + if (blk_mq_is_shared_tags(set->flags)) + drv_tags = set->shared_tags; + else + drv_tags = set->tags[hctx_idx]; + + if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { @@ -2353,7 +2688,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } - blk_mq_clear_rq_mapping(set, tags, hctx_idx); + blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); @@ -2367,21 +2702,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); } -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags) +static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; int node; @@ -2390,7 +2724,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; @@ -2398,7 +2733,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; } @@ -2407,7 +2742,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, node); if (!tags->static_rqs) { kfree(tags->rqs); - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; } @@ -2429,8 +2764,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return 0; } -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, 
struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth) +static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; @@ -2841,37 +3177,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, - int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int depth) { - unsigned int flags = set->flags; - int ret = 0; + struct blk_mq_tags *tags; + int ret; - set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags, flags); - if (!set->tags[hctx_idx]) - return false; + tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); + if (!tags) + return NULL; - ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, - set->queue_depth); - if (!ret) - return true; + ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); + if (ret) { + blk_mq_free_rq_map(tags); + return NULL; + } - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; - return false; + return tags; } -static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + int hctx_idx) { - unsigned int flags = set->flags; + if (blk_mq_is_shared_tags(set->flags)) { + set->tags[hctx_idx] = set->shared_tags; - if (set->tags && set->tags[hctx_idx]) { - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; + return true; } + + set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, + set->queue_depth); + + return set->tags[hctx_idx]; +} + +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx) +{ + if (tags) { + blk_mq_free_rqs(set, tags, hctx_idx); + blk_mq_free_rq_map(tags); + } +} + +static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (!blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); + + set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) @@ -2904,7 +3261,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && - !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. 
In this @@ -2951,8 +3308,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) - blk_mq_free_map_and_requests(set, i); + if (i) + __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; @@ -3248,8 +3605,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) - blk_mq_free_map_and_requests(set, j); + __blk_mq_free_map_and_rqs(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -3336,8 +3692,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; + if (blk_mq_is_shared_tags(set->flags)) { + set->shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + set->queue_depth); + if (!set->shared_tags) + return -ENOMEM; + } + for (i = 0; i < set->nr_hw_queues; i++) { - if (!__blk_mq_alloc_map_and_request(set, i)) + if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } @@ -3346,7 +3710,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) out_unwind: while (--i >= 0) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); + + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } return -ENOMEM; } @@ -3356,7 +3725,7 @@ out_unwind: * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ -static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) +static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -3522,27 +3891,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map; - ret = blk_mq_alloc_map_and_requests(set); + ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; - if (blk_mq_is_sbitmap_shared(set->flags)) { - atomic_set(&set->active_queues_shared_sbitmap, 0); - - if (blk_mq_init_shared_sbitmap(set)) { - ret = -ENOMEM; - goto out_free_mq_rq_maps; - } - } - mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -out_free_mq_rq_maps: - for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); @@ -3575,10 +3932,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i, j; for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); - if (blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_exit_shared_sbitmap(set); + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); @@ -3613,20 +3972,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. 
*/ - if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); - if (!ret && blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_tag_resize_shared_sbitmap(set, nr); - } else { + if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); - if (blk_mq_is_sbitmap_shared(set->flags)) { - hctx->sched_tags->bitmap_tags = - &q->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &q->sched_breserved_tags; - } + nr, true); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, + false); } if (ret) break; @@ -3635,9 +3986,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } if (!ret) { q->nr_requests = nr; - if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) - sbitmap_queue_resize(&q->sched_bitmap_tags, - nr - set->reserved_tags); + if (blk_mq_is_shared_tags(set->flags)) { + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } } blk_mq_unquiesce_queue(q); @@ -3856,15 +4210,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, return ret; } -static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, - struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) { + struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); + struct request *rq = blk_qc_to_rq(hctx, qc); struct hrtimer_sleeper hs; enum hrtimer_mode mode; unsigned int nsecs; ktime_t kt; - if (rq->rq_flags & RQF_MQ_POLL_SLEPT) + /* + * If a request has completed on queue that uses an I/O scheduler, we + * won't get back a request from blk_qc_to_rq. + */ + if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) return false; /* @@ -3906,92 +4265,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, __set_current_state(TASK_RUNNING); destroy_hrtimer_on_stack(&hs.timer); - return true; -} - -static bool blk_mq_poll_hybrid(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) -{ - struct request *rq; - - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - return false; - - if (!blk_qc_t_is_internal(cookie)) - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); - else { - rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); - /* - * With scheduling, if the request has completed, we'll - * get a NULL return here, as we clear the sched tag when - * that happens. The request still remains valid, like always, - * so we should be safe with just the NULL check. - */ - if (!rq) - return false; - } - - return blk_mq_poll_hybrid_sleep(q, rq); -} - -/** - * blk_poll - poll for IO completions - * @q: the queue - * @cookie: cookie passed back at IO submission time - * @spin: whether to spin for completions - * - * Description: - * Poll for completions on the passed in queue. Returns number of - * completed entries found. If @spin is true, then blk_poll will continue - * looping until at least one completion is found, unless the task is - * otherwise marked running (or we need to reschedule). - */ -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int state; - - if (!blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return 0; - - if (current->plug) - blk_flush_plug_list(current->plug, false); - - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; /* - * If we sleep, have the caller restart the poll loop to reset - * the state. 
Like for the other success return cases, the - * caller is responsible for checking if the IO completed. If - * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. If specified not to spin, - * we also should not sleep. + * If we sleep, have the caller restart the poll loop to reset the + * state. Like for the other success return cases, the caller is + * responsible for checking if the IO completed. If the IO isn't + * complete, we'll get called again and will go straight to the busy + * poll loop. */ - if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) - return 1; + return true; +} - hctx->poll_considered++; +static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, + struct io_comp_batch *iob, unsigned int flags) +{ + struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); + long state = get_current_state(); + int ret; - state = get_current_state(); do { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx); + ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { - hctx->poll_success++; __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); - if (task_is_running(current)) return 1; - if (ret < 0 || !spin) + + if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); @@ -3999,7 +4303,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) __set_current_state(TASK_RUNNING); return 0; } -EXPORT_SYMBOL_GPL(blk_poll); + +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) +{ + if (!(flags & BLK_POLL_NOSLEEP) && + q->poll_nsec != BLK_MQ_POLL_CLASSIC) { + if (blk_mq_poll_hybrid(q, cookie)) + return 1; + } + return blk_mq_poll_classic(q, cookie, iob, flags); +} unsigned int blk_mq_rq_cpu(struct request *rq) { diff --git a/block/blk-mq.h b/block/blk-mq.h index d08779f77a26..28859fc5faee 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -25,18 +25,14 @@ struct blk_mq_ctx { unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; - /* incremented at dispatch time */ - unsigned long rq_dispatched[2]; - unsigned long rq_merged; - - /* incremented at completion time */ - unsigned long ____cacheline_aligned_in_smp rq_completed[2]; - struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; +void blk_mq_submit_bio(struct bio *bio); +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); @@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq); */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags); -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth); - +void blk_mq_free_rq_map(struct blk_mq_tags *tags); +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, unsigned int depth); +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + 
unsigned int hctx_idx); /* * Internal helpers for request insertion into sw queues */ @@ -109,9 +102,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, enum hctx_type type = HCTX_TYPE_DEFAULT; /* - * The caller ensure that if REQ_HIPRI, poll must be enabled. + * The caller ensure that if REQ_POLLED, poll must be enabled. */ - if (flags & REQ_HIPRI) + if (flags & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((flags & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; @@ -128,6 +121,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +void blk_mq_free_plug_rqs(struct blk_plug *plug); +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_release(struct request_queue *q); @@ -154,23 +149,27 @@ struct blk_mq_alloc_data { blk_mq_req_flags_t flags; unsigned int shallow_depth; unsigned int cmd_flags; + unsigned int rq_flags; + + /* allocate multiple requests/tags in one go */ + unsigned int nr_tags; + struct request **cached_rq; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; -static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) +static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (data->q->elevator) - return data->hctx->sched_tags; - - return data->hctx->tags; + if (!(data->rq_flags & RQF_ELV)) + return data->hctx->tags; + return data->hctx->sched_tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) @@ -220,24 +219,24 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq) static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) - atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); + if (blk_mq_is_shared_tags(hctx->flags)) + atomic_inc(&hctx->queue->nr_active_requests_shared_tags); else atomic_inc(&hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) - atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap); + if (blk_mq_is_shared_tags(hctx->flags)) + atomic_dec(&hctx->queue->nr_active_requests_shared_tags); else atomic_dec(&hctx->nr_active); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) - return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); + if (blk_mq_is_shared_tags(hctx->flags)) + return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, @@ -260,7 +259,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -bool blk_mq_get_driver_tag(struct request *rq); +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq); + +static inline bool blk_mq_get_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag != BLK_MQ_NO_TAG && + !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + hctx->tags->rqs[rq->tag] = rq; + return true; + } + + return __blk_mq_get_driver_tag(hctx, rq); +} static inline void blk_mq_clear_mq_map(struct 
blk_mq_queue_map *qmap) { @@ -331,19 +343,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, if (bt->sb.depth == 1) return true; - if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; - users = atomic_read(&set->active_queues_shared_sbitmap); } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; - users = atomic_read(&hctx->tags->active_queues); } + users = atomic_read(&hctx->tags->active_queues); + if (!users) return true; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f000f83e0621..3cfbc8668cba 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) * BIO_TRACKED lets controllers know that a bio went through the * normal rq_qos path. */ - bio_set_flag(bio, BIO_TRACKED); - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_TRACKED); __rq_qos_throttle(q->rq_qos, bio); + } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, diff --git a/block/blk-settings.c b/block/blk-settings.c index a7c857ad7d10..b880c70e22e4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -842,6 +842,24 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); +static bool disk_has_partitions(struct gendisk *disk) +{ + unsigned long idx; + struct block_device *part; + bool ret = false; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part)) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + /** * blk_queue_set_zoned - configure a disk queue zoned model. * @disk: the gendisk of the queue to configure @@ -876,7 +894,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. 
*/ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - !xa_empty(&disk->part_tbl)) + disk_has_partitions(disk)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 614d9d47de36..cef1f713370b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -17,6 +17,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-wbt.h" +#include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; @@ -432,26 +433,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page) static ssize_t queue_poll_store(struct request_queue *q, const char *page, size_t count) { - unsigned long poll_on; - ssize_t ret; - - if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || - !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return -EINVAL; - - ret = queue_var_store(&poll_on, page, count); - if (ret < 0) - return ret; - - if (poll_on) { - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - } else { - blk_mq_freeze_queue(q); - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); - blk_mq_unfreeze_queue(q); - } - - return ret; + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); + pr_info_ratelimited("please use driver specific parameters instead.\n"); + return count; } static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) @@ -887,16 +873,15 @@ int blk_register_queue(struct gendisk *disk) } mutex_lock(&q->sysfs_lock); + + ret = disk_register_independent_access_ranges(disk, NULL); + if (ret) + goto put_dev; + if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) { - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); - kobject_put(&dev->kobj); - return ret; - } + if (ret) + goto put_dev; } blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); @@ -928,6 +913,16 @@ unlock: } return ret; + +put_dev: + disk_unregister_independent_access_ranges(disk); + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); + kobject_del(&q->kobj); + blk_trace_remove_sysfs(dev); + kobject_put(&dev->kobj); + + return ret; } /** @@ -972,6 +967,7 @@ void blk_unregister_queue(struct gendisk *disk) mutex_lock(&q->sysfs_lock); if (q->elevator) elv_unregister_queue(q); + disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c4e7993ba97..39bb6e68a9a2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -13,6 +13,7 @@ #include <linux/blk-cgroup.h> #include "blk.h" #include "blk-cgroup-rwstat.h" +#include "blk-throttle.h" /* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 @@ -37,60 +38,9 @@ */ #define LATENCY_FILTERED_HD (1000L) /* 1ms */ -static struct blkcg_policy blkcg_policy_throtl; - /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -/* - * To implement hierarchical throttling, throtl_grps form a tree and bios - * are dispatched upwards level by level until they reach the top and get - * issued. When dispatching bios from the children and local group at each - * level, if the bios are dispatched into a single bio_list, there's a risk - * of a local or child group which can queue many bios at once filling up - * the list starving others. - * - * To avoid such starvation, dispatched bios are queued separately - * according to where they came from. 
When they are again dispatched to - * the parent, they're popped in round-robin order so that no single source - * hogs the dispatch window. - * - * throtl_qnode is used to keep the queued bios separated by their sources. - * Bios are queued to throtl_qnode which in turn is queued to - * throtl_service_queue and then dispatched in round-robin order. - * - * It's also used to track the reference counts on blkg's. A qnode always - * belongs to a throtl_grp and gets queued on itself or the parent, so - * incrementing the reference of the associated throtl_grp when a qnode is - * queued and decrementing when dequeued is enough to keep the whole blkg - * tree pinned while bios are in flight. - */ -struct throtl_qnode { - struct list_head node; /* service_queue->queued[] */ - struct bio_list bios; /* queued bios */ - struct throtl_grp *tg; /* tg this qnode belongs to */ -}; - -struct throtl_service_queue { - struct throtl_service_queue *parent_sq; /* the parent service_queue */ - - /* - * Bios queued directly to this service_queue or dispatched from - * children throtl_grp's. - */ - struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ - unsigned int nr_queued[2]; /* number of queued bios */ - - /* - * RB tree of active children throtl_grp's, which are sorted by - * their ->disptime. - */ - struct rb_root_cached pending_tree; /* RB tree of active tgs */ - unsigned int nr_pending; /* # queued in the tree */ - unsigned long first_pending_disptime; /* disptime of the first tg */ - struct timer_list pending_timer; /* fires on first_pending_disptime */ -}; - enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ @@ -98,93 +48,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, -}; - -struct throtl_grp { - /* must be the first member */ - struct blkg_policy_data pd; - - /* active throtl group service_queue member */ - struct rb_node rb_node; - - /* throtl_data this group belongs to */ - struct throtl_data *td; - - /* this group's service queue */ - struct throtl_service_queue service_queue; - - /* - * qnode_on_self is used when bios are directly queued to this - * throtl_grp so that local bios compete fairly with bios - * dispatched from children. qnode_on_parent is used when bios are - * dispatched from this throtl_grp into its parent and will compete - * with the sibling qnode_on_parents and the parent's - * qnode_on_self. - */ - struct throtl_qnode qnode_on_self[2]; - struct throtl_qnode qnode_on_parent[2]; - - /* - * Dispatch time in jiffies. This is the estimated time when group - * will unthrottle and is ready to dispatch more bio. It is used as - * key to sort active groups in service tree. - */ - unsigned long disptime; - - unsigned int flags; - - /* are there any throtl rules between this group and td? 
*/ - bool has_rules[2]; - - /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; - /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; - - /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; - /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; - - /* Number of bytes dispatched in current slice */ - uint64_t bytes_disp[2]; - /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; - - unsigned long last_low_overflow_time[2]; - - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; - - unsigned long last_check_time; - - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ - /* When did we start a new slice */ - unsigned long slice_start[2]; - unsigned long slice_end[2]; - - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - - atomic_t io_split_cnt[2]; - atomic_t last_io_split_cnt[2]; - - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; -}; - /* We measure latency for request size from <= 4k to >= 1M */ #define LATENCY_BUCKET_SIZE 9 @@ -231,16 +94,6 @@ struct throtl_data static void throtl_pending_timer_fn(struct timer_list *t); -static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct throtl_grp, pd) : NULL; -} - -static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) -{ - return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); -} - static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); @@ -1794,7 +1647,7 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_work_sync(&td->dispatch_work); } -static struct blkcg_policy blkcg_policy_throtl = { +struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, @@ -2208,9 +2061,9 @@ void blk_throtl_charge_bio_split(struct bio *bio) } while (parent); } -bool blk_throtl_bio(struct bio *bio) +bool __blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); @@ -2221,19 +2074,12 @@ bool blk_throtl_bio(struct bio *bio) rcu_read_lock(); - /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED)) - goto out; - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); } - if (!tg->has_rules[rw]) - goto out; - spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); @@ -2317,7 +2163,6 @@ again: out_unlock: spin_unlock_irq(&q->queue_lock); -out: bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk-throttle.h b/block/blk-throttle.h new file mode 100644 index 000000000000..175f03abd9e4 --- /dev/null +++ b/block/blk-throttle.h @@ -0,0 +1,182 @@ +#ifndef BLK_THROTTLE_H +#define BLK_THROTTLE_H + +#include "blk-cgroup-rwstat.h" + +/* + * To implement hierarchical throttling, throtl_grps form a tree and bios + * are dispatched upwards level by 
level until they reach the top and get + * issued. When dispatching bios from the children and local group at each + * level, if the bios are dispatched into a single bio_list, there's a risk + * of a local or child group which can queue many bios at once filling up + * the list starving others. + * + * To avoid such starvation, dispatched bios are queued separately + * according to where they came from. When they are again dispatched to + * the parent, they're popped in round-robin order so that no single source + * hogs the dispatch window. + * + * throtl_qnode is used to keep the queued bios separated by their sources. + * Bios are queued to throtl_qnode which in turn is queued to + * throtl_service_queue and then dispatched in round-robin order. + * + * It's also used to track the reference counts on blkg's. A qnode always + * belongs to a throtl_grp and gets queued on itself or the parent, so + * incrementing the reference of the associated throtl_grp when a qnode is + * queued and decrementing when dequeued is enough to keep the whole blkg + * tree pinned while bios are in flight. + */ +struct throtl_qnode { + struct list_head node; /* service_queue->queued[] */ + struct bio_list bios; /* queued bios */ + struct throtl_grp *tg; /* tg this qnode belongs to */ +}; + +struct throtl_service_queue { + struct throtl_service_queue *parent_sq; /* the parent service_queue */ + + /* + * Bios queued directly to this service_queue or dispatched from + * children throtl_grp's. + */ + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ + unsigned int nr_queued[2]; /* number of queued bios */ + + /* + * RB tree of active children throtl_grp's, which are sorted by + * their ->disptime. + */ + struct rb_root_cached pending_tree; /* RB tree of active tgs */ + unsigned int nr_pending; /* # queued in the tree */ + unsigned long first_pending_disptime; /* disptime of the first tg */ + struct timer_list pending_timer; /* fires on first_pending_disptime */ +}; + +enum { + LIMIT_LOW, + LIMIT_MAX, + LIMIT_CNT, +}; + +struct throtl_grp { + /* must be the first member */ + struct blkg_policy_data pd; + + /* active throtl group service_queue member */ + struct rb_node rb_node; + + /* throtl_data this group belongs to */ + struct throtl_data *td; + + /* this group's service queue */ + struct throtl_service_queue service_queue; + + /* + * qnode_on_self is used when bios are directly queued to this + * throtl_grp so that local bios compete fairly with bios + * dispatched from children. qnode_on_parent is used when bios are + * dispatched from this throtl_grp into its parent and will compete + * with the sibling qnode_on_parents and the parent's + * qnode_on_self. + */ + struct throtl_qnode qnode_on_self[2]; + struct throtl_qnode qnode_on_parent[2]; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + unsigned int flags; + + /* are there any throtl rules between this group and td? 
*/ + bool has_rules[2]; + + /* internally used bytes per second rate limits */ + uint64_t bps[2][LIMIT_CNT]; + /* user configured bps limits */ + uint64_t bps_conf[2][LIMIT_CNT]; + + /* internally used IOPS limits */ + unsigned int iops[2][LIMIT_CNT]; + /* user configured IOPS limits */ + unsigned int iops_conf[2][LIMIT_CNT]; + + /* Number of bytes dispatched in current slice */ + uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; + + unsigned long last_low_overflow_time[2]; + + uint64_t last_bytes_disp[2]; + unsigned int last_io_disp[2]; + + unsigned long last_check_time; + + unsigned long latency_target; /* us */ + unsigned long latency_target_conf; /* us */ + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; + + unsigned long last_finish_time; /* ns / 1024 */ + unsigned long checked_last_finish_time; /* ns / 1024 */ + unsigned long avg_idletime; /* ns / 1024 */ + unsigned long idletime_threshold; /* us */ + unsigned long idletime_threshold_conf; /* us */ + + unsigned int bio_cnt; /* total bios */ + unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ + unsigned long bio_cnt_reset_time; + + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; +}; + +extern struct blkcg_policy blkcg_policy_throtl; + +static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct throtl_grp, pd) : NULL; +} + +static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) +{ + return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); +} + +/* + * Internal throttling interface + */ +#ifndef CONFIG_BLK_DEV_THROTTLING +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } +static inline bool blk_throtl_bio(struct bio *bio) { return false; } +#else /* CONFIG_BLK_DEV_THROTTLING */ +int blk_throtl_init(struct request_queue *q); +void blk_throtl_exit(struct request_queue *q); +void blk_throtl_register_queue(struct request_queue *q); +void blk_throtl_charge_bio_split(struct bio *bio); +bool __blk_throtl_bio(struct bio *bio); +static inline bool blk_throtl_bio(struct bio *bio) +{ + struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + + if (bio_flagged(bio, BIO_THROTTLED)) + return false; + if (!tg->has_rules[bio_data_dir(bio)]) + return false; + + return __blk_throtl_bio(bio); +} +#endif /* CONFIG_BLK_DEV_THROTTLING */ + +#endif diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 874c1c37bf0c..0c119be0e813 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -357,6 +357,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) unsigned int inflight = wbt_inflight(rwb); int status; + if (!rwb->rqos.q->disk) + return; + status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, diff --git a/block/blk.h b/block/blk.h index 7d2a0ba7ed21..7afffd548daf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -12,6 +12,8 @@ #include "blk-mq.h" #include "blk-mq-sched.h" +struct elevator_type; + /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -51,6 +53,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, void blk_free_flush_queue(struct blk_flush_queue *q); void 
blk_freeze_queue(struct request_queue *q); +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +void blk_queue_start_drain(struct request_queue *q); #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, @@ -92,6 +96,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } +static inline bool rq_mergeable(struct request *rq) +{ + if (blk_rq_is_passthrough(rq)) + return false; + + if (req_op(rq) == REQ_OP_FLUSH) + return false; + + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + + if (req_op(rq) == REQ_OP_ZONE_APPEND) + return false; + + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) + return false; + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; + + return true; +} + +/* + * There are two different ways to handle DISCARD merges: + * 1) If max_discard_segments > 1, the driver treats every bio as a range and + * send the bios to controller together. The ranges don't need to be + * contiguous. + * 2) Otherwise, the request will be normal read/write requests. The ranges + * need to be contiguous. + */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); @@ -173,21 +215,28 @@ static inline void blk_integrity_del(struct gendisk *disk) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); +void blk_print_req_error(struct request *req, blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq); + unsigned int nr_segs, bool *same_queue_rq); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); -void blk_account_io_start(struct request *req); -void blk_account_io_done(struct request *req, u64 now); +void __blk_account_io_start(struct request *req); +void __blk_account_io_done(struct request *req, u64 now); + +/* + * Plug flush limits + */ +#define BLK_MAX_REQUEST_COUNT 32 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) -void blk_insert_flush(struct request *rq); +bool blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); @@ -200,7 +249,7 @@ static inline void elevator_exit(struct request_queue *q, { lockdep_assert_held(&q->sysfs_lock); - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); __elevator_exit(q, e); } @@ -218,7 +267,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); +static inline bool blk_may_split(struct request_queue *q, struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + return true; /* non-trivial splitting decisions */ + default: + break; + } + + /* + * All drivers must accept single-segments bios that are <= PAGE_SIZE. + * This is a quick and dirty check that relies on the fact that + * bi_io_vec[0] is always valid if a bio has data. 
The check might + * lead to occasional false negatives when bios are cloned, but compared + * to the performance impact of cloned bios themselves the loop below + * doesn't matter anyway. + */ + return q->limits.chunk_sectors || bio->bi_vcnt != 1 || + bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; +} + +void __blk_queue_split(struct request_queue *q, struct bio **bio, + unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -238,7 +312,25 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); + return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk; +} + +static inline void blk_account_io_done(struct request *req, u64 now) +{ + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. + */ + if (blk_do_io_stat(req) && req->part && + !(req->rq_flags & RQF_FLUSH_SEQ)) + __blk_account_io_done(req, now); +} + +static inline void blk_account_io_start(struct request *req) +{ + if (blk_do_io_stat(req)) + __blk_account_io_start(req); } static inline void req_set_nomerge(struct request_queue *q, struct request *req) @@ -283,22 +375,6 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); -/* - * Internal throttling interface - */ -#ifdef CONFIG_BLK_DEV_THROTTLING -extern int blk_throtl_init(struct request_queue *q); -extern void blk_throtl_exit(struct request_queue *q); -extern void blk_throtl_register_queue(struct request_queue *q); -extern void blk_throtl_charge_bio_split(struct bio *bio); -bool blk_throtl_bio(struct bio *bio); -#else /* CONFIG_BLK_DEV_THROTTLING */ -static inline int blk_throtl_init(struct request_queue *q) { return 0; } -static inline void blk_throtl_exit(struct request_queue *q) { } -static inline void blk_throtl_register_queue(struct request_queue *q) { } -static inline void blk_throtl_charge_bio_split(struct bio *bio) { } -static inline bool blk_throtl_bio(struct bio *bio) { return false; } -#endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, @@ -366,13 +442,20 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; -static inline void bio_clear_hipri(struct bio *bio) +static inline void bio_clear_polled(struct bio *bio) { /* can't support alloc cache if we turn off polling */ bio_clear_flag(bio, BIO_PERCPU_CACHE); - bio->bi_opf &= ~REQ_HIPRI; + bio->bi_opf &= ~REQ_POLLED; } +long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); +long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); + extern const struct address_space_operations def_blk_aops; +int disk_register_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *new_iars); +void disk_unregister_independent_access_ranges(struct gendisk *disk); + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index 05fc7148489d..7af1a72835b9 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mempool.h> #include 
<linux/blkdev.h> +#include <linux/blk-cgroup.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> diff --git a/block/bsg-lib.c b/block/bsg-lib.c index ccb98276c964..10aa378702fa 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, struct bsg_job *job; struct request *rq; struct bio *bio; + void *reply; int ret; if (hdr->protocol != BSG_PROTOCOL_SCSI || @@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, if (!capable(CAP_SYS_RAWIO)) return -EPERM; - rq = blk_get_request(q, hdr->dout_xfer_len ? + rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); rq->timeout = timeout; job = blk_mq_rq_to_pdu(rq); + reply = job->reply; + memset(job, 0, sizeof(*job)); + job->reply = reply; + job->reply_len = SCSI_SENSE_BUFFERSIZE; + job->dd_data = job + 1; + job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); if (IS_ERR(job->request)) { ret = PTR_ERR(job->request); - goto out_put_request; + goto out_free_rq; } if (hdr->dout_xfer_len && hdr->din_xfer_len) { - job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0); + job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0); if (IS_ERR(job->bidi_rq)) { ret = PTR_ERR(job->bidi_rq); goto out_free_job_request; @@ -134,11 +141,11 @@ out_unmap_bidi_rq: blk_rq_unmap_user(job->bidi_bio); out_free_bidi_rq: if (job->bidi_rq) - blk_put_request(job->bidi_rq); + blk_mq_free_request(job->bidi_rq); out_free_job_request: kfree(job->request); -out_put_request: - blk_put_request(rq); +out_free_rq: + blk_mq_free_request(rq); return ret; } @@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, return 0; } -/* called right before the request is given to the request_queue user */ -static void bsg_initialize_rq(struct request *req) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(req); - void *reply = job->reply; - - memset(job, 0, sizeof(*job)); - job->reply = reply; - job->reply_len = SCSI_SENSE_BUFFERSIZE; - job->dd_data = job + 1; -} - static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx) { @@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = { .queue_rq = bsg_queue_rq, .init_request = bsg_init_rq, .exit_request = bsg_exit_rq, - .initialize_rq_fn = bsg_initialize_rq, .complete = bsg_complete, .timeout = bsg_timeout, }; diff --git a/block/elevator.c b/block/elevator.c index ff45d8388f48..1f39f6e8ebb9 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -26,7 +26,6 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> @@ -40,6 +39,7 @@ #include <trace/events/block.h> +#include "elevator.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" @@ -637,7 +637,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) return NULL; if (q->nr_hw_queues != 1 && - !blk_mq_is_sbitmap_shared(q->tag_set->flags)) + !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; return elevator_get(q, "mq-deadline", false); diff --git a/include/linux/elevator.h b/block/elevator.h index ef9ceead3db1..16cd8bdedb7e 100644 --- a/include/linux/elevator.h +++ b/block/elevator.h @@ -1,17 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_ELEVATOR_H -#define 
_LINUX_ELEVATOR_H +#ifndef _ELEVATOR_H +#define _ELEVATOR_H #include <linux/percpu.h> #include <linux/hashtable.h> -#ifdef CONFIG_BLOCK - struct io_cq; struct elevator_type; -#ifdef CONFIG_BLK_DEBUG_FS struct blk_mq_debugfs_attr; -#endif /* * Return values from elevator merger @@ -162,20 +158,9 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 -#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) -/* - * Elevator features. - */ - -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) -/* Supports scheduling on multiple hardware queues */ -#define ELEVATOR_F_MQ_AWARE (1U << 1) - -#endif /* CONFIG_BLOCK */ -#endif +#endif /* _ELEVATOR_H */ diff --git a/block/fops.c b/block/fops.c index 1e970c247e0e..4e22b0794c82 100644 --- a/block/fops.c +++ b/block/fops.c @@ -17,7 +17,7 @@ #include <linux/fs.h> #include "blk.h" -static struct inode *bdev_file_inode(struct file *file) +static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } @@ -54,14 +54,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio) static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct block_device *bdev = iocb->ki_filp->private_data; struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - blk_qc_t qc; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -78,7 +76,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, vecs, nr_pages); bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; + bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -102,13 +100,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); - qc = submit_bio(&bio); + submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio.bi_private)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) + if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -126,6 +123,11 @@ out: return ret; } +enum { + DIO_SHOULD_DIRTY = 1, + DIO_IS_SYNC = 2, +}; + struct blkdev_dio { union { struct kiocb *iocb; @@ -133,35 +135,27 @@ struct blkdev_dio { }; size_t size; atomic_t ref; - bool multi_bio : 1; - bool should_dirty : 1; - bool is_sync : 1; - struct bio bio; + unsigned int flags; + struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); -} - static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; + bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; if 
(bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { - if (!dio->is_sync) { + if (atomic_dec_and_test(&dio->ref)) { + if (!(dio->flags & DIO_IS_SYNC)) { struct kiocb *iocb = dio->iocb; ssize_t ret; + WRITE_ONCE(iocb->private, NULL); + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; @@ -169,9 +163,8 @@ static void blkdev_bio_end_io(struct bio *bio) ret = blk_status_to_errno(dio->bio.bi_status); } - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->multi_bio) - bio_put(&dio->bio); + dio->iocb->ki_complete(iocb, ret); + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; @@ -191,16 +184,12 @@ static void blkdev_bio_end_io(struct bio *bio) static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct file *file = iocb->ki_filp; - struct inode *inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(inode); + struct block_device *bdev = iocb->ki_filp->private_data; struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; - bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; int ret = 0; if ((pos | iov_iter_alignment(iter)) & @@ -210,28 +199,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) { + atomic_set(&dio->ref, 1); + /* + * Grab an extra reference to ensure the dio structure which is embedded + * into the first bio stays around. + */ + bio_get(bio); + + is_sync = is_sync_kiocb(iocb); + if (is_sync) { + dio->flags = DIO_IS_SYNC; dio->waiter = current; - bio_get(bio); } else { + dio->flags = 0; dio->iocb = iocb; } dio->size = 0; - dio->multi_bio = false; - dio->should_dirty = is_read && iter_is_iovec(iter); + if (is_read && iter_is_iovec(iter)) + dio->flags |= DIO_SHOULD_DIRTY; - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!is_poll) - blk_start_plug(&plug); + blk_start_plug(&plug); for (;;) { bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; @@ -246,7 +238,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (is_read) { bio->bi_opf = REQ_OP_READ; - if (dio->should_dirty) + if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { bio->bi_opf = dio_bio_write_op(iocb); @@ -260,40 +252,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { - bool polled = false; - - if (iocb->ki_flags & IOCB_HIPRI) { - bio_set_polled(bio, iocb); - polled = true; - } - - qc = submit_bio(bio); - - if (polled) - WRITE_ONCE(iocb->ki_cookie, qc); + submit_bio(bio); break; } - - if (!dio->multi_bio) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. 
- */ - if (!is_sync) - bio_get(bio); - dio->multi_bio = true; - atomic_set(&dio->ref, 2); - } else { - atomic_inc(&dio->ref); - } - + atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } - if (!is_poll) - blk_finish_plug(&plug); + blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; @@ -302,10 +269,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -318,6 +282,94 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static void blkdev_bio_end_io_async(struct bio *bio) +{ + struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (likely(!bio->bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(bio->bi_status); + } + + iocb->ki_complete(iocb, ret); + + if (dio->flags & DIO_SHOULD_DIRTY) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, + struct iov_iter *iter, + unsigned int nr_pages) +{ + struct block_device *bdev = iocb->ki_filp->private_data; + struct blkdev_dio *dio; + struct bio *bio; + loff_t pos = iocb->ki_pos; + int ret = 0; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + dio = container_of(bio, struct blkdev_dio, bio); + dio->flags = 0; + dio->iocb = iocb; + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_end_io = blkdev_bio_end_io_async; + bio->bi_ioprio = iocb->ki_ioprio; + + if (iov_iter_is_bvec(iter)) { + /* + * Users don't rely on the iterator being in any particular + * state for async I/O returning -EIOCBQUEUED, hence we can + * avoid expensive iov_iter_advance(). Bypass + * bio_iov_iter_get_pages() and set the bvec directly. 
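/*
 * The hunks above drop the old multi_bio/bio_get() bookkeeping in favour of a
 * plain reference count: dio->ref starts at 1 for the first bio, every extra
 * split bio takes another reference, and the completion path runs once
 * atomic_dec_and_test() drops the last one (an additional bio_get() keeps the
 * bio that embeds the dio alive across submission). Below is a minimal
 * standalone userspace sketch of that counting scheme using C11 atomics; the
 * demo_dio type and helper names are illustrative and not part of the patch.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_dio {
	atomic_int ref;	/* one reference per in-flight bio */
};

/* Completion handler: the last reference to drop tears the dio down. */
static void demo_bio_end_io(struct demo_dio *dio, int bio_nr)
{
	printf("bio %d completed\n", bio_nr);
	if (atomic_fetch_sub(&dio->ref, 1) == 1) {
		printf("last reference dropped, freeing dio\n");
		free(dio);
	}
}

int main(void)
{
	struct demo_dio *dio = malloc(sizeof(*dio));
	int nr_bios = 3;

	if (!dio)
		return 1;
	atomic_init(&dio->ref, 1);		/* reference for the first bio */
	for (int i = 1; i < nr_bios; i++)
		atomic_fetch_add(&dio->ref, 1);	/* one more per extra bio */

	for (int i = 0; i < nr_bios; i++)	/* completions, in any order */
		demo_bio_end_io(dio, i);
	return 0;
}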
+ */ + bio_iov_bvec_set(bio, iter); + } else { + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return ret; + } + } + dio->size = bio->bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio->bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) { + dio->flags |= DIO_SHOULD_DIRTY; + bio_set_pages_dirty(bio); + } + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + + if (iocb->ki_flags & IOCB_HIPRI) { + bio->bi_opf |= REQ_POLLED | REQ_NOWAIT; + submit_bio(bio); + WRITE_ONCE(iocb->private, bio); + } else { + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + submit_bio(bio); + } + return -EIOCBQUEUED; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { unsigned int nr_pages; @@ -326,9 +378,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - + if (likely(nr_pages <= BIO_MAX_VECS)) { + if (is_sync_kiocb(iocb)) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + return __blkdev_direct_IO_async(iocb, iter, nr_pages); + } return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } @@ -405,8 +459,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = bdev_file_inode(filp); - struct block_device *bdev = I_BDEV(bd_inode); + struct block_device *bdev = filp->private_data; int error; error = file_write_and_wait_range(filp, start, end); @@ -448,6 +501,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); if (IS_ERR(bdev)) return PTR_ERR(bdev); + + filp->private_data = bdev; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return 0; @@ -455,29 +510,12 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_close(struct inode *inode, struct file *filp) { - struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); + struct block_device *bdev = filp->private_data; blkdev_put(bdev, filp->f_mode); return 0; } -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. 
@@ -487,14 +525,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = iocb->ki_filp->private_data; + struct inode *bd_inode = bdev->bd_inode; loff_t size = i_size_read(bd_inode); struct blk_plug plug; size_t shorted = 0; ssize_t ret; - if (bdev_read_only(I_BDEV(bd_inode))) + if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) @@ -526,24 +564,26 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); + struct block_device *bdev = iocb->ki_filp->private_data; + loff_t size = i_size_read(bdev->bd_inode); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret; - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; - iov_iter_truncate(to, size); + if (unlikely(pos + iov_iter_count(to) > size)) { + if (pos >= size) + return 0; + size -= pos; + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } } ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); + + if (unlikely(shorted)) + iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } @@ -565,7 +605,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EOPNOTSUPP; /* Don't go off the end of the device. 
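/*
 * In the blkdev_read_iter() hunk above, the iterator is now only truncated
 * when the read would actually run past the end of the device, and only
 * reexpanded when something was clipped. A minimal standalone sketch of that
 * bounds arithmetic follows; clamp_read() and the sizes used are illustrative
 * assumptions, not code from the patch.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Returns the byte count to read; *shorted is how much was clipped. */
static uint64_t clamp_read(uint64_t pos, uint64_t want, uint64_t dev_size,
			   uint64_t *shorted)
{
	*shorted = 0;
	if (pos + want > dev_size) {		/* assumes no 64-bit overflow */
		if (pos >= dev_size)
			return 0;		/* read starts past the end */
		if (want > dev_size - pos) {
			*shorted = want - (dev_size - pos);
			want = dev_size - pos;	/* clip to the device size */
		}
	}
	return want;
}

int main(void)
{
	const uint64_t dev_size = 1 << 20;	/* 1 MiB device */
	uint64_t shorted;

	assert(clamp_read(0, 4096, dev_size, &shorted) == 4096 && shorted == 0);
	assert(clamp_read(dev_size - 512, 4096, dev_size, &shorted) == 512 &&
	       shorted == 3584);
	assert(clamp_read(dev_size, 4096, dev_size, &shorted) == 0);
	printf("clamp checks passed\n");
	return 0;
}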
*/ - isize = i_size_read(bdev->bd_inode); + isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { @@ -592,16 +632,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); + error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, 0); break; default: error = -EOPNOTSUPP; @@ -618,10 +660,10 @@ const struct file_operations def_blk_fops = { .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, + .iopoll = iocb_bio_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, - .unlocked_ioctl = block_ioctl, + .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif diff --git a/block/genhd.c b/block/genhd.c index 7b6e5e1cf956..febaaa55125a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -19,6 +19,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> @@ -26,6 +27,7 @@ #include <linux/badblocks.h> #include "blk.h" +#include "blk-rq-qos.h" static struct kobject *block_depr; @@ -56,6 +58,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors) spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } EXPORT_SYMBOL(set_capacity); @@ -559,6 +562,8 @@ EXPORT_SYMBOL(device_add_disk); */ void del_gendisk(struct gendisk *disk) { + struct request_queue *q = disk->queue; + might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) @@ -575,8 +580,17 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); + /* + * Fail any new I/O. + */ + set_bit(GD_DEAD, &disk->state); set_capacity(disk, 0); + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(q); + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -598,9 +612,41 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); + + blk_mq_freeze_queue_wait(q); + + rq_qos_exit(q); + blk_sync_queue(q); + blk_flush_integrity(); + /* + * Allow using passthrough request again after the queue is torn down. + */ + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + } EXPORT_SYMBOL(del_gendisk); +/** + * invalidate_disk - invalidate the disk + * @disk: the struct gendisk to invalidate + * + * A helper to invalidate the disk.
It will clean the disk's associated + * buffer/page caches and reset its internal states so that the disk + * can be reused by the drivers. + * + * Context: can sleep + */ +void invalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev = disk->part0; + + invalidate_bdev(bdev); + bdev->bd_inode->i_mapping->wb_err = 0; + set_capacity(disk, 0); +} +EXPORT_SYMBOL(invalidate_disk); + /* sysfs access to bad-blocks list. */ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, @@ -860,7 +906,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; @@ -904,7 +950,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2]; if (queue_is_mq(q)) @@ -1056,6 +1102,7 @@ static void disk_release(struct device *dev) struct gendisk *disk = dev_to_disk(dev); might_sleep(); + WARN_ON_ONCE(disk_live(disk)); disk_release_events(disk); kfree(disk->random); @@ -1243,6 +1290,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (!disk->bdi) goto out_free_disk; + /* bdev_alloc() might need the queue, set before the first call */ + disk->queue = q; + disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) goto out_free_bdi; @@ -1258,7 +1308,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); - disk->queue = q; q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED @@ -1268,6 +1317,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, out_destroy_part_tbl: xa_destroy(&disk->part_tbl); + disk->part0->bd_disk = NULL; iput(disk->part0->bd_inode); out_free_bdi: bdi_put(disk->bdi); @@ -1362,12 +1412,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only) } EXPORT_SYMBOL(set_disk_ro); -int bdev_read_only(struct block_device *bdev) -{ - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); -} -EXPORT_SYMBOL(bdev_read_only); - void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); diff --git a/block/holder.c b/block/holder.c index 9dc084182337..27cddce1b446 100644 --- a/block/holder.c +++ b/block/holder.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/genhd.h> +#include <linux/slab.h> struct bd_holder_disk { struct list_head list; diff --git a/block/ioctl.c b/block/ioctl.c index eb0491e90b9a..d6af0ac97e57 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -132,7 +132,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, if (len & 511) return -EINVAL; - if (start + len > i_size_read(bdev->bd_inode)) + if (start + len > bdev_nr_bytes(bdev)) return -EINVAL; err = truncate_bdev_range(bdev, mode, start, start + len - 1); @@ -164,7 +164,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; @@ -538,12 +538,21 @@ static int 
blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, * * New commands must be compatible and go into blkdev_common_ioctl */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - int ret; - loff_t size; + struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; + fmode_t mode = file->f_mode; + int ret; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to update it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; switch (cmd) { /* These need separate implementations for the data structure */ @@ -560,10 +569,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return put_ulong(argp, size >> 9); + return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ @@ -571,7 +579,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: @@ -588,7 +596,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -ENOTTY; return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } -EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ #ifdef CONFIG_COMPAT @@ -606,7 +613,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; - loff_t size; /* * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have @@ -632,10 +638,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return compat_put_ulong(argp, size >> 9); + return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -643,7 +648,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZSET_32: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64_32: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c deleted file mode 100644 index 2c4a55bea6ca..000000000000 --- a/block/keyslot-manager.c +++ /dev/null @@ -1,578 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2019 Google LLC - */ - -/** - * DOC: The Keyslot Manager - * - * Many devices with inline encryption support have a limited number of "slots" - * into which encryption contexts may be programmed, and requests can be tagged - * with a slot number to specify the key to use for en/decryption.
- * - * As the number of slots is limited, and programming keys is expensive on - * many inline encryption hardware, we don't want to program the same key into - * multiple slots - if multiple requests are using the same key, we want to - * program just one slot with that key and use that slot for all requests. - * - * The keyslot manager manages these keyslots appropriately, and also acts as - * an abstraction between the inline encryption hardware and the upper layers. - * - * Lower layer devices will set up a keyslot manager in their request queue - * and tell it how to perform device specific operations like programming/ - * evicting keys from keyslots. - * - * Upper layers will call blk_ksm_get_slot_for_key() to program a - * key into some slot in the inline encryption hardware. - */ - -#define pr_fmt(fmt) "blk-crypto: " fmt - -#include <linux/keyslot-manager.h> -#include <linux/device.h> -#include <linux/atomic.h> -#include <linux/mutex.h> -#include <linux/pm_runtime.h> -#include <linux/wait.h> -#include <linux/blkdev.h> - -struct blk_ksm_keyslot { - atomic_t slot_refs; - struct list_head idle_slot_node; - struct hlist_node hash_node; - const struct blk_crypto_key *key; - struct blk_keyslot_manager *ksm; -}; - -static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm) -{ - /* - * Calling into the driver requires ksm->lock held and the device - * resumed. But we must resume the device first, since that can acquire - * and release ksm->lock via blk_ksm_reprogram_all_keys(). - */ - if (ksm->dev) - pm_runtime_get_sync(ksm->dev); - down_write(&ksm->lock); -} - -static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) -{ - up_write(&ksm->lock); - if (ksm->dev) - pm_runtime_put_sync(ksm->dev); -} - -static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm) -{ - return ksm->num_slots == 0; -} - -/** - * blk_ksm_init() - Initialize a keyslot manager - * @ksm: The keyslot_manager to initialize. - * @num_slots: The number of key slots to manage. - * - * Allocate memory for keyslots and initialize a keyslot manager. Called by - * e.g. storage drivers to set up a keyslot manager in their request_queue. - * - * Return: 0 on success, or else a negative error code. - */ -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) -{ - unsigned int slot; - unsigned int i; - unsigned int slot_hashtable_size; - - memset(ksm, 0, sizeof(*ksm)); - - if (num_slots == 0) - return -EINVAL; - - ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); - if (!ksm->slots) - return -ENOMEM; - - ksm->num_slots = num_slots; - - init_rwsem(&ksm->lock); - - init_waitqueue_head(&ksm->idle_slots_wait_queue); - INIT_LIST_HEAD(&ksm->idle_slots); - - for (slot = 0; slot < num_slots; slot++) { - ksm->slots[slot].ksm = ksm; - list_add_tail(&ksm->slots[slot].idle_slot_node, - &ksm->idle_slots); - } - - spin_lock_init(&ksm->idle_slots_lock); - - slot_hashtable_size = roundup_pow_of_two(num_slots); - /* - * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 - * buckets. This only makes a difference when there is only 1 keyslot. 
- */ - if (slot_hashtable_size < 2) - slot_hashtable_size = 2; - - ksm->log_slot_ht_size = ilog2(slot_hashtable_size); - ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, - sizeof(ksm->slot_hashtable[0]), - GFP_KERNEL); - if (!ksm->slot_hashtable) - goto err_destroy_ksm; - for (i = 0; i < slot_hashtable_size; i++) - INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); - - return 0; - -err_destroy_ksm: - blk_ksm_destroy(ksm); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_ksm_init); - -static void blk_ksm_destroy_callback(void *ksm) -{ - blk_ksm_destroy(ksm); -} - -/** - * devm_blk_ksm_init() - Resource-managed blk_ksm_init() - * @dev: The device which owns the blk_keyslot_manager. - * @ksm: The blk_keyslot_manager to initialize. - * @num_slots: The number of key slots to manage. - * - * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically - * on driver detach. - * - * Return: 0 on success, or else a negative error code. - */ -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots) -{ - int err = blk_ksm_init(ksm, num_slots); - - if (err) - return err; - - return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm); -} -EXPORT_SYMBOL_GPL(devm_blk_ksm_init); - -static inline struct hlist_head * -blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; -} - -static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm = slot->ksm; - unsigned long flags; - - spin_lock_irqsave(&ksm->idle_slots_lock, flags); - list_del(&slot->idle_slot_node); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); -} - -static struct blk_ksm_keyslot *blk_ksm_find_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); - struct blk_ksm_keyslot *slotp; - - hlist_for_each_entry(slotp, head, hash_node) { - if (slotp->key == key) - return slotp; - } - return NULL; -} - -static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - return NULL; - if (atomic_inc_return(&slot->slot_refs) == 1) { - /* Took first reference to this slot; remove it from LRU list */ - blk_ksm_remove_slot_from_lru_list(slot); - } - return slot; -} - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) -{ - return slot - slot->ksm->slots; -} -EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); - -/** - * blk_ksm_get_slot_for_key() - Program a key into a keyslot. - * @ksm: The keyslot manager to program the key into. - * @key: Pointer to the key object to program, including the raw key, crypto - * mode, and data unit size. - * @slot_ptr: A pointer to return the pointer of the allocated keyslot. - * - * Get a keyslot that's been programmed with the specified key. If one already - * exists, return it with incremented refcount. Otherwise, wait for a keyslot - * to become idle and program it. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the - * allocated keyslot), or some other blk_status_t otherwise (and - * keyslot is set to NULL). 
- */ -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr) -{ - struct blk_ksm_keyslot *slot; - int slot_idx; - int err; - - *slot_ptr = NULL; - - if (blk_ksm_is_passthrough(ksm)) - return BLK_STS_OK; - - down_read(&ksm->lock); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - up_read(&ksm->lock); - if (slot) - goto success; - - for (;;) { - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - if (slot) { - blk_ksm_hw_exit(ksm); - goto success; - } - - /* - * If we're here, that means there wasn't a slot that was - * already programmed with the key. So try to program it. - */ - if (!list_empty(&ksm->idle_slots)) - break; - - blk_ksm_hw_exit(ksm); - wait_event(ksm->idle_slots_wait_queue, - !list_empty(&ksm->idle_slots)); - } - - slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, - idle_slot_node); - slot_idx = blk_ksm_get_slot_idx(slot); - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); - if (err) { - wake_up(&ksm->idle_slots_wait_queue); - blk_ksm_hw_exit(ksm); - return errno_to_blk_status(err); - } - - /* Move this slot to the hash list for the new key. */ - if (slot->key) - hlist_del(&slot->hash_node); - slot->key = key; - hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); - - atomic_set(&slot->slot_refs, 1); - - blk_ksm_remove_slot_from_lru_list(slot); - - blk_ksm_hw_exit(ksm); -success: - *slot_ptr = slot; - return BLK_STS_OK; -} - -/** - * blk_ksm_put_slot() - Release a reference to a slot - * @slot: The keyslot to release the reference of. - * - * Context: Any context. - */ -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm; - unsigned long flags; - - if (!slot) - return; - - ksm = slot->ksm; - - if (atomic_dec_and_lock_irqsave(&slot->slot_refs, - &ksm->idle_slots_lock, flags)) { - list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); - wake_up(&ksm->idle_slots_wait_queue); - } -} - -/** - * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is - * supported by a ksm. - * @ksm: The keyslot manager to check - * @cfg: The crypto configuration to check for. - * - * Checks for crypto_mode/data unit size/dun bytes support. - * - * Return: Whether or not this ksm supports the specified crypto config. - */ -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg) -{ - if (!ksm) - return false; - if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & - cfg->data_unit_size)) - return false; - if (ksm->max_dun_bytes_supported < cfg->dun_bytes) - return false; - return true; -} - -/** - * blk_ksm_evict_key() - Evict a key from the lower layer device. - * @ksm: The keyslot manager to evict from - * @key: The key to evict - * - * Find the keyslot that the specified key was programmed into, and evict that - * slot from the lower layer device. The slot must not be in use by any - * in-flight IO when this function is called. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY - * if the keyslot is still in use, or another -errno value on other - * error. 
- */ -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - int err = 0; - - if (blk_ksm_is_passthrough(ksm)) { - if (ksm->ksm_ll_ops.keyslot_evict) { - blk_ksm_hw_enter(ksm); - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1); - blk_ksm_hw_exit(ksm); - return err; - } - return 0; - } - - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - goto out_unlock; - - if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { - err = -EBUSY; - goto out_unlock; - } - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, - blk_ksm_get_slot_idx(slot)); - if (err) - goto out_unlock; - - hlist_del(&slot->hash_node); - slot->key = NULL; - err = 0; -out_unlock: - blk_ksm_hw_exit(ksm); - return err; -} - -/** - * blk_ksm_reprogram_all_keys() - Re-program all keyslots. - * @ksm: The keyslot manager - * - * Re-program all keyslots that are supposed to have a key programmed. This is - * intended only for use by drivers for hardware that loses its keys on reset. - * - * Context: Process context. Takes and releases ksm->lock. - */ -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) -{ - unsigned int slot; - - if (blk_ksm_is_passthrough(ksm)) - return; - - /* This is for device initialization, so don't resume the device */ - down_write(&ksm->lock); - for (slot = 0; slot < ksm->num_slots; slot++) { - const struct blk_crypto_key *key = ksm->slots[slot].key; - int err; - - if (!key) - continue; - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); - WARN_ON(err); - } - up_write(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm) -{ - if (!ksm) - return; - kvfree(ksm->slot_hashtable); - kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); - memzero_explicit(ksm, sizeof(*ksm)); -} -EXPORT_SYMBOL_GPL(blk_ksm_destroy); - -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) -{ - if (blk_integrity_queue_supports_integrity(q)) { - pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - return false; - } - q->ksm = ksm; - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_register); - -void blk_ksm_unregister(struct request_queue *q) -{ - q->ksm = NULL; -} - -/** - * blk_ksm_intersect_modes() - restrict supported modes by child device - * @parent: The keyslot manager for parent device - * @child: The keyslot manager for child device, or NULL - * - * Clear any crypto mode support bits in @parent that aren't set in @child. - * If @child is NULL, then all parent bits are cleared. - * - * Only use this when setting up the keyslot manager for a layered device, - * before it's been exposed yet. - */ -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child) -{ - if (child) { - unsigned int i; - - parent->max_dun_bytes_supported = - min(parent->max_dun_bytes_supported, - child->max_dun_bytes_supported); - for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported); - i++) { - parent->crypto_modes_supported[i] &= - child->crypto_modes_supported[i]; - } - } else { - parent->max_dun_bytes_supported = 0; - memset(parent->crypto_modes_supported, 0, - sizeof(parent->crypto_modes_supported)); - } -} -EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes); - -/** - * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes - * and DUN bytes that another KSM supports. 
Here, - * "superset" refers to the mathematical meaning of the - * word - i.e. if two KSMs have the *same* capabilities, - * they *are* considered supersets of each other. - * @ksm_superset: The KSM that we want to verify is a superset - * @ksm_subset: The KSM that we want to verify is a subset - * - * Return: True if @ksm_superset supports a superset of the crypto modes and DUN - * bytes that @ksm_subset supports. - */ -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset) -{ - int i; - - if (!ksm_subset) - return true; - - if (!ksm_superset) - return false; - - for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) { - if (ksm_subset->crypto_modes_supported[i] & - (~ksm_superset->crypto_modes_supported[i])) { - return false; - } - } - - if (ksm_subset->max_dun_bytes_supported > - ksm_superset->max_dun_bytes_supported) { - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_is_superset); - -/** - * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of - * another KSM - * @target_ksm: The KSM whose restrictions to update. - * @reference_ksm: The KSM to whose restrictions this function will update - * @target_ksm's restrictions to. - * - * Blk-crypto requires that crypto capabilities that were - * advertised when a bio was created continue to be supported by the - * device until that bio is ended. This is turn means that a device cannot - * shrink its advertised crypto capabilities without any explicit - * synchronization with upper layers. So if there's no such explicit - * synchronization, @reference_ksm must support all the crypto capabilities that - * @target_ksm does - * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true). - * - * Note also that as long as the crypto capabilities are being expanded, the - * order of updates becoming visible is not important because it's alright - * for blk-crypto to see stale values - they only cause blk-crypto to - * believe that a crypto capability isn't supported when it actually is (which - * might result in blk-crypto-fallback being used if available, or the bio being - * failed). - */ -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm) -{ - memcpy(target_ksm->crypto_modes_supported, - reference_ksm->crypto_modes_supported, - sizeof(target_ksm->crypto_modes_supported)); - - target_ksm->max_dun_bytes_supported = - reference_ksm->max_dun_bytes_supported; -} -EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities); - -/** - * blk_ksm_init_passthrough() - Init a passthrough keyslot manager - * @ksm: The keyslot manager to init - * - * Initialize a passthrough keyslot manager. - * Called by e.g. storage drivers to set up a keyslot manager in their - * request_queue, when the storage driver wants to manage its keys by itself. - * This is useful for inline encryption hardware that doesn't have the concept - * of keyslots, and for layered devices. 
- */ -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm) -{ - memset(ksm, 0, sizeof(*ksm)); - init_rwsem(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 15a8be57203d..fdd74a4df56f 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -9,12 +9,12 @@ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> -#include <linux/elevator.h> #include <linux/module.h> #include <linux/sbitmap.h> #include <trace/events/block.h> +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -151,6 +151,7 @@ struct kyber_ctx_queue { struct kyber_queue_data { struct request_queue *q; + dev_t dev; /* * Each scheduling domain has a limited number of in-flight requests @@ -257,7 +258,7 @@ static int calculate_percentile(struct kyber_queue_data *kqd, } memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); - trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain], kyber_latency_type_names[type], percentile, bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); @@ -270,7 +271,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd, depth = clamp(depth, 1U, kyber_depth[sched_domain]); if (depth != kqd->domain_tokens[sched_domain].sb.depth) { sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); - trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain], depth); } } @@ -366,6 +367,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) goto err; kqd->q = q; + kqd->dev = disk_devt(q->disk); kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, GFP_KERNEL | __GFP_ZERO); @@ -451,11 +453,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags->sb.shift; + unsigned int shift = tags->bitmap_tags.sb.shift; kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); } static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) @@ -774,7 +776,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { @@ -787,7 +789,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 7f3c3932b723..85d919bf60c7 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -9,7 +9,6 @@ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> -#include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> @@ -20,6 +19,7 @@ #include <trace/events/block.h> +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -31,6 +31,11 @@ */ static const int read_expire = HZ / 2; /* max time before a read is submitted. 
*/ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +/* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ +static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ @@ -51,17 +56,16 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; -/* I/O statistics per I/O priority. */ +/* + * I/O statistics per I/O priority. It is fine if these counters overflow. + * What matters is that these counters are at least as wide as + * log2(max_outstanding_requests). + */ struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - -/* I/O statistics for all I/O priorities (enum dd_prio). */ -struct io_stats { - struct io_stats_per_prio stats[DD_PRIO_COUNT]; + uint32_t inserted; + uint32_t merged; + uint32_t dispatched; + atomic_t completed; }; /* @@ -74,6 +78,7 @@ struct dd_per_prio { struct list_head fifo_list[DD_DIR_COUNT]; /* Next request in FIFO order. Read, write or both are NULL. */ struct request *next_rq[DD_DIR_COUNT]; + struct io_stats_per_prio stats; }; struct deadline_data { @@ -88,8 +93,6 @@ struct deadline_data { unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ - struct io_stats __percpu *stats; - /* * settings that change how the i/o scheduler behaves */ @@ -98,38 +101,12 @@ struct deadline_data { int writes_starved; int front_merges; u32 async_depth; + int prio_aging_expire; spinlock_t lock; spinlock_t zone_lock; }; -/* Count one event of type 'event_type' and with I/O priority 'prio' */ -#define dd_count(dd, event_type, prio) do { \ - struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - local_inc(&io_stats->stats[(prio)].event_type); \ - put_cpu_ptr(io_stats); \ -} while (0) - -/* - * Returns the total number of dd_count(dd, event_type, prio) calls across all - * CPUs. No locking or barriers since it is fine if the returned sum is slightly - * outdated. - */ -#define dd_sum(dd, event_type, prio) ({ \ - unsigned int cpu; \ - u32 sum = 0; \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - for_each_present_cpu(cpu) \ - sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ - stats[(prio)].event_type); \ - sum; \ -}) - /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, @@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, merged, prio); + lockdep_assert_held(&dd->lock); + + dd->per_prio[prio].stats.merged++; /* * if next expires before rq, assign its expire time to rq @@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, deadline_remove_request(rq->q, per_prio, rq); } +/* Number of requests queued for a given priority level. 
*/ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; + + lockdep_assert_held(&dd->lock); + + return stats->inserted - atomic_read(&stats->completed); +} + /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) @@ -356,11 +345,26 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, } /* + * Returns true if and only if @rq started after @latest_start where + * @latest_start is in jiffies. + */ +static bool started_after(struct deadline_data *dd, struct request *rq, + unsigned long latest_start) +{ + unsigned long start_time = (unsigned long)rq->fifo_time; + + start_time -= dd->fifo_expire[rq_data_dir(rq)]; + + return time_after(start_time, latest_start); +} + +/* * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc + * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio) + struct dd_per_prio *per_prio, + unsigned long latest_start) { struct request *rq, *next_rq; enum dd_data_dir data_dir; @@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); + if (started_after(dd, rq, latest_start)) + return NULL; list_del_init(&rq->queuelist); goto done; } @@ -449,6 +455,9 @@ dispatch_find_request: dd->batching = 0; dispatch_request: + if (started_after(dd, rq, latest_start)) + return NULL; + /* * rq is the selected appropriate request. */ @@ -457,7 +466,7 @@ dispatch_request: done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, dispatched, prio); + dd->per_prio[prio].stats.dispatched++; /* * If the request needs its target zone locked, do it. */ @@ -467,6 +476,34 @@ done: } /* + * Check whether there are any requests with priority other than DD_RT_PRIO + * that were inserted more than prio_aging_expire jiffies ago. + */ +static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, + unsigned long now) +{ + struct request *rq; + enum dd_prio prio; + int prio_cnt; + + lockdep_assert_held(&dd->lock); + + prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + + !!dd_queued(dd, DD_IDLE_PRIO); + if (prio_cnt < 2) + return NULL; + + for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], + now - dd->prio_aging_expire); + if (rq) + return rq; + } + + return NULL; +} + +/* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * * One confusing aspect here is that we get called for a specific @@ -477,15 +514,26 @@ done: static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) + goto unlock; + + /* + * Next, dispatch requests in priority order. Ignore lower priority + * requests if any higher priority requests are pending. 
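/*
 * The new io_stats_per_prio counters above are deliberately only 32 bits
 * wide: dd_queued() and dd_owned_by_driver() consume differences such as
 * inserted - completed, and unsigned arithmetic keeps those differences
 * exact modulo 2^32 even after the individual counters wrap. A small
 * standalone sketch of that property follows; the variable names are
 * illustrative, not taken from the scheduler.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Outstanding requests = inserted - completed, even across wraparound. */
static uint32_t queued(uint32_t inserted, uint32_t completed)
{
	return inserted - completed;
}

int main(void)
{
	uint32_t inserted = UINT32_MAX - 2;	/* about to wrap */
	uint32_t completed = UINT32_MAX - 7;	/* five requests in flight */

	assert(queued(inserted, completed) == 5);

	inserted += 3;				/* wraps: UINT32_MAX - 2 + 3 == 0 */
	assert(inserted == 0);
	assert(queued(inserted, completed) == 8);

	printf("wraparound-safe accounting holds\n");
	return 0;
}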
+ */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); - if (rq) + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); + if (rq || dd_queued(dd, prio)) break; } + +unlock: spin_unlock(&dd->lock); return rq; @@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) dd->async_depth = max(1UL, 3 * q->nr_requests / 4); - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ @@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e) for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; + const struct io_stats_per_prio *stats = &per_prio->stats; + uint32_t queued; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); - } - free_percpu(dd->stats); + spin_lock(&dd->lock); + queued = dd_queued(dd, prio); + spin_unlock(&dd->lock); + + WARN_ONCE(queued != 0, + "statistics for priority %d: i %u m %u d %u c %u\n", + prio, stats->inserted, stats->merged, + stats->dispatched, atomic_read(&stats->completed)); + } kfree(dd); } @@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; - dd->stats = alloc_percpu_gfp(typeof(*dd->stats), - GFP_KERNEL | __GFP_ZERO); - if (!dd->stats) - goto free_dd; - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); q->elevator = eq; return 0; -free_dd: - kfree(dd); - put_eq: kobject_put(&eq->kobj); return ret; @@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq); prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, inserted, prio); - rq->elv.priv[0] = (void *)(uintptr_t)1; + per_prio = &dd->per_prio[prio]; + if (!rq->elv.priv[0]) { + per_prio->stats.inserted++; + rq->elv.priv[0] = (void *)(uintptr_t)1; + } if (blk_mq_sched_try_insert_merge(q, rq, &free)) { blk_mq_free_requests(&free); @@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - per_prio = &dd->per_prio[prio]; if (at_head) { list_add(&rq->queuelist, &per_prio->dispatch); } else { @@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq) /* * The block layer core may call dd_finish_request() without having - * called dd_insert_requests(). Hence only update statistics for - * requests for which dd_insert_requests() has been called. See also - * blk_mq_request_bypass_insert(). + * called dd_insert_requests(). Skip requests that bypassed I/O + * scheduling. See also blk_mq_request_bypass_insert(). 
*/ - if (rq->elv.priv[0]) - dd_count(dd, completed, prio); + if (!rq->elv.priv[0]) + return; + + atomic_inc(&per_prio->stats.completed); if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->front_merges); @@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); @@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), + DD_ATTR(prio_aging_expire), __ATTR_NULL }; @@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m) return 0; } -/* Number of requests queued for a given priority level. */ -static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) -{ - return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); -} - static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; + u32 rt, be, idle; + + spin_lock(&dd->lock); + rt = dd_queued(dd, DD_RT_PRIO); + be = dd_queued(dd, DD_BE_PRIO); + idle = dd_queued(dd, DD_IDLE_PRIO); + spin_unlock(&dd->lock); + + seq_printf(m, "%u %u %u\n", rt, be, idle); - seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), - dd_queued(dd, DD_BE_PRIO), - dd_queued(dd, DD_IDLE_PRIO)); return 0; } /* Number of requests owned by the block driver for a given priority. 
*/ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { - return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) - - dd_sum(dd, completed, prio); + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; + + lockdep_assert_held(&dd->lock); + + return stats->dispatched + stats->merged - + atomic_read(&stats->completed); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; + u32 rt, be, idle; + + spin_lock(&dd->lock); + rt = dd_owned_by_driver(dd, DD_RT_PRIO); + be = dd_owned_by_driver(dd, DD_BE_PRIO); + idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); + spin_unlock(&dd->lock); + + seq_printf(m, "%u %u %u\n", rt, be, idle); - seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), - dd_owned_by_driver(dd, DD_BE_PRIO), - dd_owned_by_driver(dd, DD_IDLE_PRIO)); return 0; } diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 278593b8e4e9..7aff4eb81c60 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -2,6 +2,8 @@ # # Partition configuration # +menu "Partition Types" + config PARTITION_ADVANCED bool "Advanced partition selection" help @@ -267,3 +269,5 @@ config CMDLINE_PARTITION help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. + +endmenu diff --git a/block/partitions/core.c b/block/partitions/core.c index 58c4c362c94f..334b72ef1d73 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -5,6 +5,7 @@ * Copyright (C) 2020 Christoph Hellwig */ #include <linux/fs.h> +#include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/genhd.h> @@ -90,6 +91,7 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } @@ -203,7 +205,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits, bdev->bd_start_sect)); } @@ -213,7 +215,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits, bdev->bd_start_sect)); } @@ -423,6 +425,7 @@ out_del: device_del(pdev); out_put: put_device(pdev); + return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7ca5c4c374d4..5e9be13a56a8 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len) */ static u64 last_lba(struct gendisk *disk) { - return div_u64(disk->part0->bd_inode->i_size, + return div_u64(bdev_nr_bytes(disk->part0), queue_logical_block_size(disk->queue)) - 1ULL; } diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 9bca396aef4a..403756dbd50d 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, char name[], union label_t *label, sector_t labelsect, - loff_t i_size, + sector_t nr_sectors, 
dasd_information2_t *info) { loff_t offset, geo_size, size; @@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state, } else { /* * Formated w/o large volume support. If the sanity check - * 'size based on geo == size based on i_size' is true, then + * 'size based on geo == size based on nr_sectors' is true, then * we can safely assume that we know the formatted size of * the disk, otherwise we need additional information * that we can only get from a real DASD device. */ geo_size = geo->cylinders * geo->heads * geo->sectors * secperblk; - size = i_size >> 9; + size = nr_sectors; if (size != geo_size) { if (!info) { strlcat(state->pp_buf, "\n", PAGE_SIZE); @@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, if (!strcmp(info->type, "ECKD")) if (geo_size < size) size = geo_size; - /* else keep size based on i_size */ + /* else keep size based on nr_sectors */ } } /* first and only partition starts in the first block after the label */ @@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state) struct gendisk *disk = state->disk; struct block_device *bdev = disk->part0; int blocksize, res; - loff_t i_size, offset, size; + loff_t offset, size; + sector_t nr_sectors; dasd_information2_t *info; struct hd_geometry *geo; char type[5] = {0,}; @@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state) blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_symbol; - i_size = i_size_read(bdev->bd_inode); - if (i_size == 0) + nr_sectors = bdev_nr_sectors(bdev); + if (nr_sectors == 0) goto out_symbol; info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); if (info == NULL) @@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state) label); } else if (!strncmp(type, "LNX1", 4)) { res = find_lnx1_partitions(state, geo, blocksize, name, - label, labelsect, i_size, + label, labelsect, nr_sectors, info); } else if (!strncmp(type, "CMS1", 4)) { res = find_cms1_partitions(state, geo, blocksize, name, @@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state) res = 1; if (info->format == DASD_FORMAT_LDL) { strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); - size = i_size >> 9; + size = nr_sectors; offset = (info->label_block + 1) * (blocksize >> 9); put_partition(state, 1, offset, size-offset); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/block/t10-pi.c b/block/t10-pi.c index 00c203b2a921..25a52a2a09a8 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -5,7 +5,7 @@ */ #include <linux/t10-pi.h> -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/crc-t10dif.h> #include <linux/module.h> #include <net/checksum.h> diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 8bd288d2b089..3dd5a773c320 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1076,7 +1076,7 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err) af_alg_free_resources(areq); sock_put(sk); - iocb->ki_complete(iocb, err ? err : (int)resultlen, 0); + iocb->ki_complete(iocb, err ? 
err : (int)resultlen); } EXPORT_SYMBOL_GPL(af_alg_async_cb); diff --git a/drivers/Kconfig b/drivers/Kconfig index 30d2db37cc87..0d399ddaa185 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -17,6 +17,8 @@ source "drivers/bus/Kconfig" source "drivers/connector/Kconfig" +source "drivers/firmware/Kconfig" + source "drivers/gnss/Kconfig" source "drivers/mtd/Kconfig" diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 0a0a982f9c28..c0e77c1c8e09 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -36,7 +36,7 @@ struct acpi_gtdt_descriptor { static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; -static inline void *next_platform_timer(void *platform_timer) +static inline __init void *next_platform_timer(void *platform_timer) { struct acpi_gtdt_header *gh = platform_timer; diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index b9863e22b952..f0ed4414edb1 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -1035,13 +1035,8 @@ void acpi_turn_off_unused_power_resources(void) list_for_each_entry_reverse(resource, &acpi_power_resource_list, list_node) { mutex_lock(&resource->resource_lock); - /* - * Turn off power resources in an unknown state too, because the - * platform firmware on some system expects the OS to turn off - * power resources without any users unconditionally. - */ if (!resource->ref_count && - resource->state != ACPI_POWER_RESOURCE_STATE_OFF) { + resource->state == ACPI_POWER_RESOURCE_STATE_ON) { acpi_handle_debug(resource->device.handle, "Turning OFF\n"); __acpi_power_off(resource); } diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index f9383736fa0f..71419eb16e09 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -21,6 +21,7 @@ #include <linux/earlycpio.h> #include <linux/initrd.h> #include <linux/security.h> +#include <linux/kmemleak.h> #include "internal.h" #ifdef CONFIG_ACPI_CUSTOM_DSDT @@ -601,6 +602,8 @@ void __init acpi_table_upgrade(void) */ arch_reserve_mem_area(acpi_tables_addr, all_tables_size); + kmemleak_ignore_phys(acpi_tables_addr); + /* * early_ioremap only can remap 256k one time. If we map all * tables one time, we will hit the limit. 
Need to map chunks diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index bd92b549fd5a..1c48358b43ba 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -371,7 +371,7 @@ static int lps0_device_attach(struct acpi_device *adev, return 0; if (acpi_s2idle_vendor_amd()) { - /* AMD0004, AMDI0005: + /* AMD0004, AMD0005, AMDI0005: * - Should use rev_id 0x0 * - function mask > 0x3: Should use AMD method, but has off by one bug * - function mask = 0x3: Should use Microsoft method @@ -390,6 +390,7 @@ static int lps0_device_attach(struct acpi_device *adev, ACPI_LPS0_DSM_UUID_MICROSOFT, 0, &lps0_dsm_guid_microsoft); if (lps0_dsm_func_mask > 0x3 && (!strcmp(hid, "AMD0004") || + !strcmp(hid, "AMD0005") || !strcmp(hid, "AMDI0005"))) { lps0_dsm_func_mask = (lps0_dsm_func_mask << 1) | 0x1; acpi_handle_debug(adev->handle, "_DSM UUID %s: Adjusted function mask: 0x%x\n", diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index b2f552088291..0910441321f7 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -440,10 +440,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, hpriv->phy_regulator = devm_regulator_get(dev, "phy"); if (IS_ERR(hpriv->phy_regulator)) { rc = PTR_ERR(hpriv->phy_regulator); - if (rc == -EPROBE_DEFER) - goto err_out; - rc = 0; - hpriv->phy_regulator = NULL; + goto err_out; } if (flags & AHCI_PLATFORM_GET_RESETS) { diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index eed65311b5d1..75f1a6cd6621 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2459,18 +2459,70 @@ static void ata_dev_config_devslp(struct ata_device *dev) } } +static void ata_dev_config_cpr(struct ata_device *dev) +{ + unsigned int err_mask; + size_t buf_len; + int i, nr_cpr = 0; + struct ata_cpr_log *cpr_log = NULL; + u8 *desc, *buf = NULL; + + if (!ata_identify_page_supported(dev, + ATA_LOG_CONCURRENT_POSITIONING_RANGES)) + goto out; + + /* + * Read IDENTIFY DEVICE data log, page 0x47 + * (concurrent positioning ranges). We can have at most 255 32B range + * descriptors plus a 64B header. + */ + buf_len = (64 + 255 * 32 + 511) & ~511; + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + goto out; + + err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, + ATA_LOG_CONCURRENT_POSITIONING_RANGES, + buf, buf_len >> 9); + if (err_mask) + goto out; + + nr_cpr = buf[0]; + if (!nr_cpr) + goto out; + + cpr_log = kzalloc(struct_size(cpr_log, cpr, nr_cpr), GFP_KERNEL); + if (!cpr_log) + goto out; + + cpr_log->nr_cpr = nr_cpr; + desc = &buf[64]; + for (i = 0; i < nr_cpr; i++, desc += 32) { + cpr_log->cpr[i].num = desc[0]; + cpr_log->cpr[i].num_storage_elements = desc[1]; + cpr_log->cpr[i].start_lba = get_unaligned_le64(&desc[8]); + cpr_log->cpr[i].num_lbas = get_unaligned_le64(&desc[16]); + } + +out: + swap(dev->cpr_log, cpr_log); + kfree(cpr_log); + kfree(buf); +} + static void ata_dev_print_features(struct ata_device *dev) { if (!(dev->flags & ATA_DFLAG_FEATURES_MASK)) return; ata_dev_info(dev, - "Features:%s%s%s%s%s\n", + "Features:%s%s%s%s%s%s\n", dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "", dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "", dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "", dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "", - dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : ""); + dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "", + dev->cpr_log ? 
" CPR" : ""); } /** @@ -2634,6 +2686,7 @@ int ata_dev_configure(struct ata_device *dev) ata_dev_config_sense_reporting(dev); ata_dev_config_zac(dev); ata_dev_config_trusted(dev); + ata_dev_config_cpr(dev); dev->cdb_len = 32; if (ata_msg_drv(ap) && print_info) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 1fb4611f7eeb..15a279f773c7 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1895,7 +1895,7 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) */ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf) { - int num_pages; + int i, num_pages = 0; static const u8 pages[] = { 0x00, /* page 0x00, this page */ 0x80, /* page 0x80, unit serial no page */ @@ -1905,13 +1905,17 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf) 0xb1, /* page 0xb1, block device characteristics page */ 0xb2, /* page 0xb2, thin provisioning page */ 0xb6, /* page 0xb6, zoned block device characteristics */ + 0xb9, /* page 0xb9, concurrent positioning ranges */ }; - num_pages = sizeof(pages); - if (!(args->dev->flags & ATA_DFLAG_ZAC)) - num_pages--; + for (i = 0; i < sizeof(pages); i++) { + if (pages[i] == 0xb6 && + !(args->dev->flags & ATA_DFLAG_ZAC)) + continue; + rbuf[num_pages + 4] = pages[i]; + num_pages++; + } rbuf[3] = num_pages; /* number of supported VPD pages */ - memcpy(rbuf + 4, pages, num_pages); return 0; } @@ -2121,6 +2125,26 @@ static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf) return 0; } +static unsigned int ata_scsiop_inq_b9(struct ata_scsi_args *args, u8 *rbuf) +{ + struct ata_cpr_log *cpr_log = args->dev->cpr_log; + u8 *desc = &rbuf[64]; + int i; + + /* SCSI Concurrent Positioning Ranges VPD page: SBC-5 rev 1 or later */ + rbuf[1] = 0xb9; + put_unaligned_be16(64 + (int)cpr_log->nr_cpr * 32 - 4, &rbuf[3]); + + for (i = 0; i < cpr_log->nr_cpr; i++, desc += 32) { + desc[0] = cpr_log->cpr[i].num; + desc[1] = cpr_log->cpr[i].num_storage_elements; + put_unaligned_be64(cpr_log->cpr[i].start_lba, &desc[8]); + put_unaligned_be64(cpr_log->cpr[i].num_lbas, &desc[16]); + } + + return 0; +} + /** * modecpy - Prepare response for MODE SENSE * @dest: output buffer @@ -4120,11 +4144,17 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2); break; case 0xb6: - if (dev->flags & ATA_DFLAG_ZAC) { + if (dev->flags & ATA_DFLAG_ZAC) ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6); - break; - } - fallthrough; + else + ata_scsi_set_invalid_field(dev, cmd, 2, 0xff); + break; + case 0xb9: + if (dev->cpr_log) + ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b9); + else + ata_scsi_set_invalid_field(dev, cmd, 2, 0xff); + break; default: ata_scsi_set_invalid_field(dev, cmd, 2, 0xff); break; diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c index c3e6592712c4..0a8bf09a5c19 100644 --- a/drivers/ata/pata_legacy.c +++ b/drivers/ata/pata_legacy.c @@ -352,7 +352,8 @@ static unsigned int pdc_data_xfer_vlb(struct ata_queued_cmd *qc, iowrite32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); if (unlikely(slop)) { - __le32 pad; + __le32 pad = 0; + if (rw == READ) { pad = cpu_to_le32(ioread32(ap->ioaddr.data_addr)); memcpy(buf + buflen - slop, &pad, slop); @@ -742,7 +743,8 @@ static unsigned int vlb32_data_xfer(struct ata_queued_cmd *qc, ioread32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); if (unlikely(slop)) { - __le32 pad; + __le32 pad = 0; + if (rw == WRITE) { memcpy(&pad, buf + buflen - slop, slop); 
iowrite32(le32_to_cpu(pad), ap->ioaddr.data_addr); diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 9d86203e1e7a..c53633d47bfb 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -3896,8 +3896,8 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx) break; default: - dev_err(host->dev, "BUG: invalid board index %u\n", board_idx); - return 1; + dev_alert(host->dev, "BUG: invalid board index %u\n", board_idx); + return -EINVAL; } hpriv->hp_flags = hp_flags; diff --git a/drivers/base/core.c b/drivers/base/core.c index 15986cc2fe5e..249da496581a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -687,7 +687,8 @@ struct device_link *device_link_add(struct device *consumer, { struct device_link *link; - if (!consumer || !supplier || flags & ~DL_ADD_VALID_FLAGS || + if (!consumer || !supplier || consumer == supplier || + flags & ~DL_ADD_VALID_FLAGS || (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) || (flags & DL_FLAG_SYNC_STATE_ONLY && (flags & ~DL_FLAG_INFERRED) != DL_FLAG_SYNC_STATE_ONLY) || diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c index cfa29dc89bbf..fabf87058d80 100644 --- a/drivers/base/regmap/regcache-rbtree.c +++ b/drivers/base/regmap/regcache-rbtree.c @@ -281,14 +281,14 @@ static int regcache_rbtree_insert_to_block(struct regmap *map, if (!blk) return -ENOMEM; + rbnode->block = blk; + if (BITS_TO_LONGS(blklen) > BITS_TO_LONGS(rbnode->blklen)) { present = krealloc(rbnode->cache_present, BITS_TO_LONGS(blklen) * sizeof(*present), GFP_KERNEL); - if (!present) { - kfree(blk); + if (!present) return -ENOMEM; - } memset(present + BITS_TO_LONGS(rbnode->blklen), 0, (BITS_TO_LONGS(blklen) - BITS_TO_LONGS(rbnode->blklen)) @@ -305,7 +305,6 @@ static int regcache_rbtree_insert_to_block(struct regmap *map, } /* update the rbnode block, its size and the base register */ - rbnode->block = blk; rbnode->blklen = blklen; rbnode->base_reg = base_reg; rbnode->cache_present = present; diff --git a/drivers/base/test/Makefile b/drivers/base/test/Makefile index 64b2f3d744d5..7f76fee6f989 100644 --- a/drivers/base/test/Makefile +++ b/drivers/base/test/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_TEST_ASYNC_DRIVER_PROBE) += test_async_driver_probe.o obj-$(CONFIG_DRIVER_PE_KUNIT_TEST) += property-entry-test.o -CFLAGS_REMOVE_property-entry-test.o += -fplugin-arg-structleak_plugin-byref -fplugin-arg-structleak_plugin-byref-all +CFLAGS_property-entry-test.o += $(DISABLE_STRUCTLEAK_PLUGIN) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ab3e37aa1830..d97eaf6adb6d 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -180,14 +180,6 @@ config BLK_DEV_LOOP bits of, say, a sound file). This is also safe if the file resides on a remote file server. - There are several ways of encrypting disks. Some of these require - kernel patches. The vanilla kernel offers the cryptoloop option - and a Device Mapper target (which is superior, as it supports all - file systems). If you want to use the cryptoloop, say Y to both - LOOP and CRYPTOLOOP, and make sure you have a recent (version 2.12 - or later) version of util-linux. Additionally, be aware that - the cryptoloop is not safe for storing journaled filesystems. - Note that this loop device has nothing to do with the loopback device used for network connections from the machine to itself. 
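[Editor's note, not part of the series] With the BLK_DEV_CRYPTOLOOP option removed below and the xor/cryptoloop transfer paths dropped from drivers/block/loop.c further down, a loop device can now only be attached without a transfer function; loop_set_status_from_info() rejects LO_CRYPT_XOR and LO_CRYPT_CRYPTOAPI with -EINVAL and a pr_warn() pointing at dm-crypt. The sketch that follows is purely illustrative userspace code, not something from this patch set: it shows a plain attach through the existing LOOP_CONFIGURE ioctl. The backing image name and block size are arbitrary assumptions, and error handling is abbreviated.

/*
 * Illustrative only: attach a hypothetical backing.img to the first free
 * loop minor with LOOP_CONFIGURE and no encryption, the only mode that
 * remains once the cryptoloop/xor transfer code is gone.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/loop.h>

int main(void)
{
	struct loop_config cfg;
	char path[32];
	int ctrl, devnr, loopfd, backing;

	ctrl = open("/dev/loop-control", O_RDWR);
	devnr = ioctl(ctrl, LOOP_CTL_GET_FREE);	/* first unused minor */
	snprintf(path, sizeof(path), "/dev/loop%d", devnr);

	loopfd = open(path, O_RDWR);
	backing = open("backing.img", O_RDWR);	/* hypothetical image file */

	memset(&cfg, 0, sizeof(cfg));
	cfg.fd = backing;
	cfg.block_size = 4096;	/* validated via blk_validate_block_size() */
	/* cfg.info.lo_encrypt_type stays LO_CRYPT_NONE; XOR/CRYPTOAPI -> -EINVAL */

	if (ioctl(loopfd, LOOP_CONFIGURE, &cfg))
		perror("LOOP_CONFIGURE");

	close(backing);
	close(loopfd);
	close(ctrl);
	return 0;
}

Existing setups that relied on cryptoloop or the xor transformation should migrate to a dm-crypt mapping on top of the plain loop device, which is the replacement the removed Kconfig help text already recommended.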
@@ -211,21 +203,6 @@ config BLK_DEV_LOOP_MIN_COUNT is used, it can be set to 0, since needed loop devices can be dynamically allocated with the /dev/loop-control interface. -config BLK_DEV_CRYPTOLOOP - tristate "Cryptoloop Support (DEPRECATED)" - select CRYPTO - select CRYPTO_CBC - depends on BLK_DEV_LOOP - help - Say Y here if you want to be able to use the ciphers that are - provided by the CryptoAPI as loop transformation. This might be - used as hard disk encryption. - - WARNING: This device is not safe for journaled file systems like - ext3 or Reiserfs. Please use the Device Mapper crypto module - instead, which can be configured to be on-disk compatible with the - cryptoloop device. cryptoloop support will be removed in Linux 5.16. - source "drivers/block/drbd/Kconfig" config BLK_DEV_NBD @@ -304,8 +281,8 @@ config BLK_DEV_RAM_SIZE config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media (DEPRECATED)" depends on !UML + depends on SCSI select CDROM - select SCSI_COMMON help Note: This driver is deprecated and will be removed from the kernel in the near future! diff --git a/drivers/block/Makefile b/drivers/block/Makefile index bc68817ef496..11a74f17c9ad 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -24,7 +24,6 @@ obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o -obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8b1714021498..bf5c124c5452 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -61,10 +61,10 @@ #include <linux/hdreg.h> #include <linux/delay.h> #include <linux/init.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/fs.h> #include <linux/blk-mq.h> -#include <linux/elevator.h> #include <linux/interrupt.h> #include <linux/platform_device.h> @@ -1780,6 +1780,7 @@ static const struct blk_mq_ops amiflop_mq_ops = { static int fd_alloc_disk(int drive, int system) { struct gendisk *disk; + int err; disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); if (IS_ERR(disk)) @@ -1798,8 +1799,10 @@ static int fd_alloc_disk(int drive, int system) set_capacity(disk, 880 * 2); unit[drive].gendisk[system] = disk; - add_disk(disk); - return 0; + err = add_disk(disk); + if (err) + blk_cleanup_disk(disk); + return err; } static int fd_alloc_drive(int drive) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 06b360f7123a..52484bcdedb9 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -37,8 +37,7 @@ static ssize_t aoedisk_show_state(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct aoedev *d = disk->private_data; - return snprintf(page, PAGE_SIZE, - "%s%s\n", + return sysfs_emit(page, "%s%s\n", (d->flags & DEVFL_UP) ? "up" : "down", (d->flags & DEVFL_KICKME) ? ",kickme" : (d->nopen && !(d->flags & DEVFL_UP)) ? 
",closewait" : ""); @@ -52,8 +51,8 @@ static ssize_t aoedisk_show_mac(struct device *dev, struct aoetgt *t = d->targets[0]; if (t == NULL) - return snprintf(page, PAGE_SIZE, "none\n"); - return snprintf(page, PAGE_SIZE, "%pm\n", t->addr); + return sysfs_emit(page, "none\n"); + return sysfs_emit(page, "%pm\n", t->addr); } static ssize_t aoedisk_show_netif(struct device *dev, struct device_attribute *attr, char *page) @@ -85,7 +84,7 @@ static ssize_t aoedisk_show_netif(struct device *dev, ne = nd; nd = nds; if (*nd == NULL) - return snprintf(page, PAGE_SIZE, "none\n"); + return sysfs_emit(page, "none\n"); for (p = page; nd < ne; nd++) p += scnprintf(p, PAGE_SIZE - (p-page), "%s%s", p == page ? "" : ",", (*nd)->name); @@ -99,7 +98,7 @@ static ssize_t aoedisk_show_fwver(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct aoedev *d = disk->private_data; - return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver); + return sysfs_emit(page, "0x%04x\n", (unsigned int) d->fw_ver); } static ssize_t aoedisk_show_payload(struct device *dev, struct device_attribute *attr, char *page) @@ -107,7 +106,7 @@ static ssize_t aoedisk_show_payload(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct aoedev *d = disk->private_data; - return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); + return sysfs_emit(page, "%lu\n", d->maxbcnt); } static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) @@ -417,7 +416,9 @@ aoeblk_gdalloc(void *vp) spin_unlock_irqrestore(&d->lock, flags); - device_add_disk(NULL, gd, aoe_attr_groups); + err = device_add_disk(NULL, gd, aoe_attr_groups); + if (err) + goto out_disk_cleanup; aoedisk_add_debugfs(d); spin_lock_irqsave(&d->lock, flags); @@ -426,6 +427,8 @@ aoeblk_gdalloc(void *vp) spin_unlock_irqrestore(&d->lock, flags); return; +out_disk_cleanup: + blk_cleanup_disk(gd); err_tagset: blk_mq_free_tag_set(set); err_mempool: diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index a093644ac39f..d14bdc3589b2 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -68,6 +68,7 @@ #include <linux/delay.h> #include <linux/init.h> #include <linux/blk-mq.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/completion.h> #include <linux/wait.h> @@ -298,6 +299,7 @@ static struct atari_floppy_struct { disk change detection) */ int flags; /* flags */ struct gendisk *disk[NUM_DISK_MINORS]; + bool registered[NUM_DISK_MINORS]; int ref; int type; struct blk_mq_tag_set tag_set; @@ -456,10 +458,20 @@ static DEFINE_TIMER(fd_timer, check_change); static void fd_end_request_cur(blk_status_t err) { + DPRINT(("fd_end_request_cur(), bytes %d of %d\n", + blk_rq_cur_bytes(fd_request), + blk_rq_bytes(fd_request))); + if (!blk_update_request(fd_request, err, blk_rq_cur_bytes(fd_request))) { + DPRINT(("calling __blk_mq_end_request()\n")); __blk_mq_end_request(fd_request, err); fd_request = NULL; + } else { + /* requeue rest of request */ + DPRINT(("calling blk_mq_requeue_request()\n")); + blk_mq_requeue_request(fd_request, true); + fd_request = NULL; } } @@ -653,9 +665,6 @@ static inline void copy_buffer(void *from, void *to) *p2++ = *p1++; } - - - /* General Interrupt Handling */ static void (*FloppyIRQHandler)( int status ) = NULL; @@ -700,12 +709,21 @@ static void fd_error( void ) if (fd_request->error_count >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); fd_end_request_cur(BLK_STS_IOERR); + finish_fdc(); + return; } else if (fd_request->error_count == RECALIBRATE_ERRORS) { 
printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); if (SelectedDrive != -1) SUD.track = -1; } + /* need to re-run request to recalibrate */ + atari_disable_irq( IRQ_MFP_FDC ); + + setup_req_params( SelectedDrive ); + do_fd_action( SelectedDrive ); + + atari_enable_irq( IRQ_MFP_FDC ); } @@ -732,8 +750,10 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) if (type) { type--; if (type >= NUM_DISK_MINORS || - minor2disktype[type].drive_types > DriveType) + minor2disktype[type].drive_types > DriveType) { + finish_fdc(); return -EINVAL; + } } q = unit[drive].disk[type]->queue; @@ -751,6 +771,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) } if (!UDT || desc->track >= UDT->blocks/UDT->spt/2 || desc->head >= 2) { + finish_fdc(); ret = -EINVAL; goto out; } @@ -791,6 +812,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) wait_for_completion(&format_wait); + finish_fdc(); ret = FormatError ? -EIO : 0; out: blk_mq_unquiesce_queue(q); @@ -825,6 +847,7 @@ static void do_fd_action( int drive ) else { /* all sectors finished */ fd_end_request_cur(BLK_STS_OK); + finish_fdc(); return; } } @@ -1229,6 +1252,7 @@ static void fd_rwsec_done1(int status) else { /* all sectors finished */ fd_end_request_cur(BLK_STS_OK); + finish_fdc(); } return; @@ -1350,7 +1374,7 @@ static void fd_times_out(struct timer_list *unused) static void finish_fdc( void ) { - if (!NeedSeek) { + if (!NeedSeek || !stdma_is_locked_by(floppy_irq)) { finish_fdc_done( 0 ); } else { @@ -1385,7 +1409,8 @@ static void finish_fdc_done( int dummy ) start_motor_off_timer(); local_irq_save(flags); - stdma_release(); + if (stdma_is_locked_by(floppy_irq)) + stdma_release(); local_irq_restore(flags); DPRINT(("finish_fdc() finished\n")); @@ -1435,8 +1460,7 @@ static int floppy_revalidate(struct gendisk *disk) unsigned int drive = p - unit; if (test_bit(drive, &changed_floppies) || - test_bit(drive, &fake_change) || - p->disktype == 0) { + test_bit(drive, &fake_change) || !p->disktype) { if (UD.flags & FTD_MSG) printk(KERN_ERR "floppy: clear format %p!\n", UDT); BufferDrive = -1; @@ -1475,15 +1499,6 @@ static void setup_req_params( int drive ) ReqTrack, ReqSector, (unsigned long)ReqData )); } -static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx) -{ - spin_lock_irq(&ataflop_lock); - atari_disable_irq(IRQ_MFP_FDC); - finish_fdc(); - atari_enable_irq(IRQ_MFP_FDC); - spin_unlock_irq(&ataflop_lock); -} - static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1491,6 +1506,10 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, int drive = floppy - unit; int type = floppy->type; + DPRINT(("Queue request: drive %d type %d sectors %d of %d last %d\n", + drive, type, blk_rq_cur_sectors(bd->rq), + blk_rq_sectors(bd->rq), bd->last)); + spin_lock_irq(&ataflop_lock); if (fd_request) { spin_unlock_irq(&ataflop_lock); @@ -1511,6 +1530,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, /* drive not connected */ printk(KERN_ERR "Unknown Device: fd%d\n", drive ); fd_end_request_cur(BLK_STS_IOERR); + stdma_release(); goto out; } @@ -1527,11 +1547,13 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, if (--type >= NUM_DISK_MINORS) { printk(KERN_WARNING "fd%d: invalid disk format", drive ); fd_end_request_cur(BLK_STS_IOERR); + stdma_release(); goto out; } if (minor2disktype[type].drive_types > DriveType) { printk(KERN_WARNING "fd%d: unsupported disk format", drive ); 
fd_end_request_cur(BLK_STS_IOERR); + stdma_release(); goto out; } type = minor2disktype[type].index; @@ -1550,8 +1572,6 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, setup_req_params( drive ); do_fd_action( drive ); - if (bd->last) - finish_fdc(); atari_enable_irq( IRQ_MFP_FDC ); out: @@ -1634,6 +1654,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, /* what if type > 0 here? Overwrite specified entry ? */ if (type) { /* refuse to re-set a predefined type for now */ + finish_fdc(); return -EINVAL; } @@ -1701,8 +1722,10 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, /* sanity check */ if (setprm.track != dtp->blocks/dtp->spt/2 || - setprm.head != 2) + setprm.head != 2) { + finish_fdc(); return -EINVAL; + } UDT = dtp; set_capacity(disk, UDT->blocks); @@ -1962,7 +1985,6 @@ static const struct block_device_operations floppy_fops = { static const struct blk_mq_ops ataflop_mq_ops = { .queue_rq = ataflop_queue_rq, - .commit_rqs = ataflop_commit_rqs, }; static int ataflop_alloc_disk(unsigned int drive, unsigned int type) @@ -2000,12 +2022,28 @@ static void ataflop_probe(dev_t dev) return; mutex_lock(&ataflop_probe_lock); if (!unit[drive].disk[type]) { - if (ataflop_alloc_disk(drive, type) == 0) + if (ataflop_alloc_disk(drive, type) == 0) { add_disk(unit[drive].disk[type]); + unit[drive].registered[type] = true; + } } mutex_unlock(&ataflop_probe_lock); } +static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs) +{ + int type; + + for (type = 0; type < NUM_DISK_MINORS; type++) { + if (!fs->disk[type]) + continue; + if (fs->registered[type]) + del_gendisk(fs->disk[type]); + blk_cleanup_disk(fs->disk[type]); + } + blk_mq_free_tag_set(&fs->tag_set); +} + static int __init atari_floppy_init (void) { int i; @@ -2064,7 +2102,10 @@ static int __init atari_floppy_init (void) for (i = 0; i < FD_MAX_UNITS; i++) { unit[i].track = -1; unit[i].flags = 0; - add_disk(unit[i].disk[0]); + ret = add_disk(unit[i].disk[0]); + if (ret) + goto err_out_dma; + unit[i].registered[0] = true; } printk(KERN_INFO "Atari floppy driver: max. 
%cD, %strack buffering\n", @@ -2074,12 +2115,11 @@ static int __init atari_floppy_init (void) return 0; +err_out_dma: + atari_stram_free(DMABuffer); err: - while (--i >= 0) { - blk_cleanup_queue(unit[i].disk[0]->queue); - put_disk(unit[i].disk[0]); - blk_mq_free_tag_set(&unit[i].tag_set); - } + while (--i >= 0) + atari_cleanup_floppy_disk(&unit[i]); unregister_blkdev(FLOPPY_MAJOR, "fd"); out_unlock: @@ -2128,18 +2168,10 @@ __setup("floppy=", atari_floppy_setup); static void __exit atari_floppy_exit(void) { - int i, type; + int i; - for (i = 0; i < FD_MAX_UNITS; i++) { - for (type = 0; type < NUM_DISK_MINORS; type++) { - if (!unit[i].disk[type]) - continue; - del_gendisk(unit[i].disk[type]); - blk_cleanup_queue(unit[i].disk[type]->queue); - put_disk(unit[i].disk[type]); - } - blk_mq_free_tag_set(&unit[i].tag_set); - } + for (i = 0; i < FD_MAX_UNITS; i++) + atari_cleanup_floppy_disk(&unit[i]); unregister_blkdev(FLOPPY_MAJOR, "fd"); del_timer_sync(&fd_timer); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 58ec167aa018..aa0472718dce 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -282,7 +282,7 @@ out: return err; } -static blk_qc_t brd_submit_bio(struct bio *bio) +static void brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; sector_t sector = bio->bi_iter.bi_sector; @@ -299,16 +299,14 @@ static blk_qc_t brd_submit_bio(struct bio *bio) err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, bio_op(bio), sector); - if (err) - goto io_error; + if (err) { + bio_io_error(bio); + return; + } sector += len >> SECTOR_SHIFT; } bio_endio(bio); - return BLK_QC_T_NONE; -io_error: - bio_io_error(bio); - return BLK_QC_T_NONE; } static int brd_rw_page(struct block_device *bdev, sector_t sector, @@ -373,10 +371,22 @@ static int brd_alloc(int i) struct gendisk *disk; char buf[DISK_NAME_LEN]; + mutex_lock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) { + if (brd->brd_number == i) { + mutex_unlock(&brd_devices_mutex); + return -EEXIST; + } + } brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) + if (!brd) { + mutex_unlock(&brd_devices_mutex); return -ENOMEM; + } brd->brd_number = i; + list_add_tail(&brd->brd_list, &brd_devices); + mutex_unlock(&brd_devices_mutex); + spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); @@ -411,37 +421,30 @@ static int brd_alloc(int i) blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); add_disk(disk); - list_add_tail(&brd->brd_list, &brd_devices); return 0; out_free_dev: + mutex_lock(&brd_devices_mutex); + list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); kfree(brd); return -ENOMEM; } static void brd_probe(dev_t dev) { - int i = MINOR(dev) / max_part; - struct brd_device *brd; - - mutex_lock(&brd_devices_mutex); - list_for_each_entry(brd, &brd_devices, brd_list) { - if (brd->brd_number == i) - goto out_unlock; - } - - brd_alloc(i); -out_unlock: - mutex_unlock(&brd_devices_mutex); + brd_alloc(MINOR(dev) / max_part); } static void brd_del_one(struct brd_device *brd) { - list_del(&brd->brd_list); del_gendisk(brd->brd_disk); blk_cleanup_disk(brd->brd_disk); brd_free_pages(brd); + mutex_lock(&brd_devices_mutex); + list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); kfree(brd); } @@ -491,25 +494,21 @@ static int __init brd_init(void) brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - mutex_lock(&brd_devices_mutex); for (i = 0; i < rd_nr; i++) { err = brd_alloc(i); if 
(err) goto out_free; } - mutex_unlock(&brd_devices_mutex); - pr_info("brd: module loaded\n"); return 0; out_free: + unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); debugfs_remove_recursive(brd_debugfs_dir); list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - mutex_unlock(&brd_devices_mutex); - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); pr_info("brd: module NOT loaded !!!\n"); return err; @@ -519,13 +518,12 @@ static void __exit brd_exit(void) { struct brd_device *brd, *next; + unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); debugfs_remove_recursive(brd_debugfs_dir); list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - pr_info("brd: module unloaded\n"); } diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c deleted file mode 100644 index f0a91faa43a8..000000000000 --- a/drivers/block/cryptoloop.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - Linux loop encryption enabling module - - Copyright (C) 2002 Herbert Valerio Riedel <hvr@gnu.org> - Copyright (C) 2003 Fruhwirth Clemens <clemens@endorphin.org> - - */ - -#include <linux/module.h> - -#include <crypto/skcipher.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/blkdev.h> -#include <linux/scatterlist.h> -#include <linux/uaccess.h> -#include "loop.h" - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI"); -MODULE_AUTHOR("Herbert Valerio Riedel <hvr@gnu.org>"); - -#define LOOP_IV_SECTOR_BITS 9 -#define LOOP_IV_SECTOR_SIZE (1 << LOOP_IV_SECTOR_BITS) - -static int -cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) -{ - int err = -EINVAL; - int cipher_len; - int mode_len; - char cms[LO_NAME_SIZE]; /* cipher-mode string */ - char *mode; - char *cmsp = cms; /* c-m string pointer */ - struct crypto_sync_skcipher *tfm; - - /* encryption breaks for non sector aligned offsets */ - - if (info->lo_offset % LOOP_IV_SECTOR_SIZE) - goto out; - - strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); - cms[LO_NAME_SIZE - 1] = 0; - - cipher_len = strcspn(cmsp, "-"); - - mode = cmsp + cipher_len; - mode_len = 0; - if (*mode) { - mode++; - mode_len = strcspn(mode, "-"); - } - - if (!mode_len) { - mode = "cbc"; - mode_len = 3; - } - - if (cipher_len + mode_len + 3 > LO_NAME_SIZE) - return -EINVAL; - - memmove(cms, mode, mode_len); - cmsp = cms + mode_len; - *cmsp++ = '('; - memcpy(cmsp, info->lo_crypt_name, cipher_len); - cmsp += cipher_len; - *cmsp++ = ')'; - *cmsp = 0; - - tfm = crypto_alloc_sync_skcipher(cms, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - err = crypto_sync_skcipher_setkey(tfm, info->lo_encrypt_key, - info->lo_encrypt_key_size); - - if (err != 0) - goto out_free_tfm; - - lo->key_data = tfm; - return 0; - - out_free_tfm: - crypto_free_sync_skcipher(tfm); - - out: - return err; -} - - -typedef int (*encdec_cbc_t)(struct skcipher_request *req); - -static int -cryptoloop_transfer(struct loop_device *lo, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t IV) -{ - struct crypto_sync_skcipher *tfm = lo->key_data; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - struct scatterlist sg_out; - struct scatterlist sg_in; - - encdec_cbc_t encdecfunc; - struct page *in_page, *out_page; - unsigned in_offs, out_offs; - int err; - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - - 
sg_init_table(&sg_out, 1); - sg_init_table(&sg_in, 1); - - if (cmd == READ) { - in_page = raw_page; - in_offs = raw_off; - out_page = loop_page; - out_offs = loop_off; - encdecfunc = crypto_skcipher_decrypt; - } else { - in_page = loop_page; - in_offs = loop_off; - out_page = raw_page; - out_offs = raw_off; - encdecfunc = crypto_skcipher_encrypt; - } - - while (size > 0) { - const int sz = min(size, LOOP_IV_SECTOR_SIZE); - u32 iv[4] = { 0, }; - iv[0] = cpu_to_le32(IV & 0xffffffff); - - sg_set_page(&sg_in, in_page, sz, in_offs); - sg_set_page(&sg_out, out_page, sz, out_offs); - - skcipher_request_set_crypt(req, &sg_in, &sg_out, sz, iv); - err = encdecfunc(req); - if (err) - goto out; - - IV++; - size -= sz; - in_offs += sz; - out_offs += sz; - } - - err = 0; - -out: - skcipher_request_zero(req); - return err; -} - -static int -cryptoloop_ioctl(struct loop_device *lo, int cmd, unsigned long arg) -{ - return -EINVAL; -} - -static int -cryptoloop_release(struct loop_device *lo) -{ - struct crypto_sync_skcipher *tfm = lo->key_data; - if (tfm != NULL) { - crypto_free_sync_skcipher(tfm); - lo->key_data = NULL; - return 0; - } - printk(KERN_ERR "cryptoloop_release(): tfm == NULL?\n"); - return -EINVAL; -} - -static struct loop_func_table cryptoloop_funcs = { - .number = LO_CRYPT_CRYPTOAPI, - .init = cryptoloop_init, - .ioctl = cryptoloop_ioctl, - .transfer = cryptoloop_transfer, - .release = cryptoloop_release, - .owner = THIS_MODULE -}; - -static int __init -init_cryptoloop(void) -{ - int rc = loop_register_transfer(&cryptoloop_funcs); - - if (rc) - printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n"); - else - pr_warn("the cryptoloop driver has been deprecated and will be removed in in Linux 5.16\n"); - return rc; -} - -static void __exit -cleanup_cryptoloop(void) -{ - if (loop_unregister_transfer(LO_CRYPT_CRYPTOAPI)) - printk(KERN_ERR - "cryptoloop: loop_unregister_transfer failed\n"); -} - -module_init(init_cryptoloop); -module_exit(cleanup_cryptoloop); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 5d9181382ce1..f27d5b0f9a0b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1448,7 +1448,7 @@ extern void conn_free_crypto(struct drbd_connection *connection); /* drbd_req */ extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *); -extern blk_qc_t drbd_submit_bio(struct bio *bio); +void drbd_submit_bio(struct bio *bio); extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); extern int is_valid_ar_handle(struct drbd_request *, sector_t); @@ -1826,8 +1826,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) /* Returns the number of 512 byte sectors of the device */ static inline sector_t drbd_get_capacity(struct block_device *bdev) { - /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ - return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0; + return bdev ? 
bdev_nr_sectors(bdev) : 0; } /** diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 55234a558e98..19db80a1e409 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2794,7 +2794,9 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig goto out_idr_remove_vol; } - add_disk(disk); + err = add_disk(disk); + if (err) + goto out_cleanup_disk; /* inherit the connection state */ device->state.conn = first_connection(resource)->cstate; @@ -2808,6 +2810,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_debugfs_device_add(device); return NO_ERROR; +out_cleanup_disk: + blk_cleanup_disk(disk); out_idr_remove_vol: idr_remove(&connection->peer_devices, vnr); out_idr_remove_from_resource: diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5ca233644d70..3235532ae077 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1596,7 +1596,7 @@ void do_submit(struct work_struct *ws) } } -blk_qc_t drbd_submit_bio(struct bio *bio) +void drbd_submit_bio(struct bio *bio) { struct drbd_device *device = bio->bi_bdev->bd_disk->private_data; @@ -1609,7 +1609,6 @@ blk_qc_t drbd_submit_bio(struct bio *bio) inc_ap_bio(device); __drbd_make_request(device, bio); - return BLK_QC_T_NONE; } static bool net_timeout_reached(struct drbd_request *net_req, diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index fef79ea52e3e..3873e789478e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -184,6 +184,7 @@ static int print_unex = 1; #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/major.h> #include <linux/platform_device.h> #include <linux/mod_devicetable.h> #include <linux/mutex.h> @@ -4478,6 +4479,7 @@ static const struct blk_mq_ops floppy_mq_ops = { }; static struct platform_device floppy_device[N_DRIVE]; +static bool registered[N_DRIVE]; static bool floppy_available(int drive) { @@ -4693,8 +4695,12 @@ static int __init do_floppy_init(void) if (err) goto out_remove_drives; - device_add_disk(&floppy_device[drive].dev, disks[drive][0], - NULL); + registered[drive] = true; + + err = device_add_disk(&floppy_device[drive].dev, + disks[drive][0], NULL); + if (err) + goto out_remove_drives; } return 0; @@ -4703,7 +4709,8 @@ out_remove_drives: while (drive--) { if (floppy_available(drive)) { del_gendisk(disks[drive][0]); - platform_device_unregister(&floppy_device[drive]); + if (registered[drive]) + platform_device_unregister(&floppy_device[drive]); } } out_release_dma: @@ -4946,30 +4953,14 @@ static void __exit floppy_module_exit(void) if (disks[drive][i]) del_gendisk(disks[drive][i]); } - platform_device_unregister(&floppy_device[drive]); + if (registered[drive]) + platform_device_unregister(&floppy_device[drive]); } for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { if (disks[drive][i]) - blk_cleanup_queue(disks[drive][i]->queue); + blk_cleanup_disk(disks[drive][i]); } blk_mq_free_tag_set(&tag_sets[drive]); - - /* - * These disks have not called add_disk(). Don't put down - * queue reference in put_disk(). 
- */ - if (!(allowed_drive_mask & (1 << drive)) || - fdc_state[FDC(drive)].version == FDC_NONE) { - for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { - if (disks[drive][i]) - disks[drive][i]->queue = NULL; - } - } - - for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { - if (disks[drive][i]) - put_disk(disks[drive][i]); - } } cancel_delayed_work_sync(&fd_timeout); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 7bf4686af774..3c09a33fa1c7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -133,58 +133,6 @@ static void loop_global_unlock(struct loop_device *lo, bool global) static int max_part; static int part_shift; -static int transfer_xor(struct loop_device *lo, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t real_block) -{ - char *raw_buf = kmap_atomic(raw_page) + raw_off; - char *loop_buf = kmap_atomic(loop_page) + loop_off; - char *in, *out, *key; - int i, keysize; - - if (cmd == READ) { - in = raw_buf; - out = loop_buf; - } else { - in = loop_buf; - out = raw_buf; - } - - key = lo->lo_encrypt_key; - keysize = lo->lo_encrypt_key_size; - for (i = 0; i < size; i++) - *out++ = *in++ ^ key[(i & 511) % keysize]; - - kunmap_atomic(loop_buf); - kunmap_atomic(raw_buf); - cond_resched(); - return 0; -} - -static int xor_init(struct loop_device *lo, const struct loop_info64 *info) -{ - if (unlikely(info->lo_encrypt_key_size <= 0)) - return -EINVAL; - return 0; -} - -static struct loop_func_table none_funcs = { - .number = LO_CRYPT_NONE, -}; - -static struct loop_func_table xor_funcs = { - .number = LO_CRYPT_XOR, - .transfer = transfer_xor, - .init = xor_init -}; - -/* xfer_funcs[0] is special - its release function is never called */ -static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { - &none_funcs, - &xor_funcs -}; - static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) { loff_t loopsize; @@ -228,8 +176,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) /* * We support direct I/O only if lo_offset is aligned with the * logical I/O size of backing device, and the logical block - * size of loop is bigger than the backing device's and the loop - * needn't transform transfer. + * size of loop is bigger than the backing device's. 
* * TODO: the above condition may be loosed in the future, and * direct I/O may be switched runtime at that time because most @@ -238,8 +185,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) if (dio) { if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO && - !lo->transfer) + mapping->a_ops->direct_IO) use_dio = true; else use_dio = false; @@ -273,19 +219,6 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) } /** - * loop_validate_block_size() - validates the passed in block size - * @bsize: size to validate - */ -static int -loop_validate_block_size(unsigned short bsize) -{ - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) - return -EINVAL; - - return 0; -} - -/** * loop_set_size() - sets device size and notifies userspace * @lo: struct loop_device to set the size for * @size: new size of the loop device @@ -299,24 +232,6 @@ static void loop_set_size(struct loop_device *lo, loff_t size) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } -static inline int -lo_do_transfer(struct loop_device *lo, int cmd, - struct page *rpage, unsigned roffs, - struct page *lpage, unsigned loffs, - int size, sector_t rblock) -{ - int ret; - - ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); - if (likely(!ret)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Transfer error at byte offset %llu, length %i.\n", - (unsigned long long)rblock << 9, size); - return ret; -} - static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) { struct iov_iter i; @@ -356,41 +271,6 @@ static int lo_write_simple(struct loop_device *lo, struct request *rq, return ret; } -/* - * This is the slow, transforming version that needs to double buffer the - * data as it cannot do the transformations in place without having direct - * access to the destination pages of the backing file. 
- */ -static int lo_write_transfer(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec, b; - struct req_iterator iter; - struct page *page; - int ret = 0; - - page = alloc_page(GFP_NOIO); - if (unlikely(!page)) - return -ENOMEM; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page, - bvec.bv_offset, bvec.bv_len, pos >> 9); - if (unlikely(ret)) - break; - - b.bv_page = page; - b.bv_offset = 0; - b.bv_len = bvec.bv_len; - ret = lo_write_bvec(lo->lo_backing_file, &b, &pos); - if (ret < 0) - break; - } - - __free_page(page); - return ret; -} - static int lo_read_simple(struct loop_device *lo, struct request *rq, loff_t pos) { @@ -420,64 +300,12 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq, return 0; } -static int lo_read_transfer(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec, b; - struct req_iterator iter; - struct iov_iter i; - struct page *page; - ssize_t len; - int ret = 0; - - page = alloc_page(GFP_NOIO); - if (unlikely(!page)) - return -ENOMEM; - - rq_for_each_segment(bvec, rq, iter) { - loff_t offset = pos; - - b.bv_page = page; - b.bv_offset = 0; - b.bv_len = bvec.bv_len; - - iov_iter_bvec(&i, READ, &b, 1, b.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) { - ret = len; - goto out_free_page; - } - - ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page, - bvec.bv_offset, len, offset >> 9); - if (ret) - goto out_free_page; - - flush_dcache_page(bvec.bv_page); - - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } - } - - ret = 0; -out_free_page: - __free_page(page); - return ret; -} - static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, int mode) { /* * We use fallocate to manipulate the space mappings used by the image - * a.k.a. discard/zerorange. However we do not support this if - * encryption is enabled, because it may give an attacker useful - * information. + * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; struct request_queue *q = lo->lo_queue; @@ -554,7 +382,7 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) blk_mq_complete_request(rq); } -static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) +static void lo_rw_aio_complete(struct kiocb *iocb, long ret) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); @@ -627,7 +455,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, lo_rw_aio_do_completion(cmd); if (ret != -EIOCBQUEUED) - cmd->iocb.ki_complete(&cmd->iocb, ret, 0); + lo_rw_aio_complete(&cmd->iocb, ret); return 0; } @@ -660,16 +488,12 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: - if (lo->transfer) - return lo_write_transfer(lo, rq, pos); - else if (cmd->use_aio) + if (cmd->use_aio) return lo_rw_aio(lo, cmd, pos, WRITE); else return lo_write_simple(lo, rq, pos); case REQ_OP_READ: - if (lo->transfer) - return lo_read_transfer(lo, rq, pos); - else if (cmd->use_aio) + if (cmd->use_aio) return lo_rw_aio(lo, cmd, pos, READ); else return lo_read_simple(lo, rq, pos); @@ -934,7 +758,7 @@ static void loop_config_discard(struct loop_device *lo) * not blkdev_issue_discard(). This maintains consistent behavior with * file-backed loop devices: discarded regions read back as zero. 
*/ - if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) { + if (S_ISBLK(inode->i_mode)) { struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; @@ -943,11 +767,9 @@ static void loop_config_discard(struct loop_device *lo) /* * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. + * image a.k.a. discard. */ - } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) { + } else if (!file->f_op->fallocate) { max_discard_sectors = 0; granularity = 0; @@ -1084,43 +906,6 @@ static void loop_update_rotational(struct loop_device *lo) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); } -static int -loop_release_xfer(struct loop_device *lo) -{ - int err = 0; - struct loop_func_table *xfer = lo->lo_encryption; - - if (xfer) { - if (xfer->release) - err = xfer->release(lo); - lo->transfer = NULL; - lo->lo_encryption = NULL; - module_put(xfer->owner); - } - return err; -} - -static int -loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, - const struct loop_info64 *i) -{ - int err = 0; - - if (xfer) { - struct module *owner = xfer->owner; - - if (!try_module_get(owner)) - return -EINVAL; - if (xfer->init) - err = xfer->init(lo, i); - if (err) - module_put(owner); - else - lo->lo_encryption = xfer; - } - return err; -} - /** * loop_set_status_from_info - configure device from loop_info * @lo: struct loop_device to configure @@ -1133,55 +918,27 @@ static int loop_set_status_from_info(struct loop_device *lo, const struct loop_info64 *info) { - int err; - struct loop_func_table *xfer; - kuid_t uid = current_uid(); - if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; - err = loop_release_xfer(lo); - if (err) - return err; - - if (info->lo_encrypt_type) { - unsigned int type = info->lo_encrypt_type; - - if (type >= MAX_LO_CRYPT) - return -EINVAL; - xfer = xfer_funcs[type]; - if (xfer == NULL) - return -EINVAL; - } else - xfer = NULL; - - err = loop_init_xfer(lo, xfer, info); - if (err) - return err; + switch (info->lo_encrypt_type) { + case LO_CRYPT_NONE: + break; + case LO_CRYPT_XOR: + pr_warn("support for the xor transformation has been removed.\n"); + return -EINVAL; + case LO_CRYPT_CRYPTOAPI: + pr_warn("support for cryptoloop has been removed. 
Use dm-crypt instead.\n"); + return -EINVAL; + default: + return -EINVAL; + } lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); - memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; - - if (!xfer) - xfer = &none_funcs; - lo->transfer = xfer->transfer; - lo->ioctl = xfer->ioctl; - lo->lo_flags = info->lo_flags; - - lo->lo_encrypt_key_size = info->lo_encrypt_key_size; - lo->lo_init[0] = info->lo_init[0]; - lo->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_key_size) { - memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, - info->lo_encrypt_key_size); - lo->lo_key_owner = uid; - } - return 0; } @@ -1236,7 +993,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, } if (config->block_size) { - error = loop_validate_block_size(config->block_size); + error = blk_validate_block_size(config->block_size); if (error) goto out_unlock; } @@ -1329,7 +1086,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp = NULL; gfp_t gfp = lo->old_gfp_mask; - struct block_device *bdev = lo->lo_device; int err = 0; bool partscan = false; int lo_number; @@ -1381,36 +1137,23 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); - loop_release_xfer(lo); - lo->transfer = NULL; - lo->ioctl = NULL; lo->lo_device = NULL; - lo->lo_encryption = NULL; lo->lo_offset = 0; lo->lo_sizelimit = 0; - lo->lo_encrypt_key_size = 0; - memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); - memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); blk_queue_logical_block_size(lo->lo_queue, 512); blk_queue_physical_block_size(lo->lo_queue, 512); blk_queue_io_min(lo->lo_queue, 512); - if (bdev) { - invalidate_bdev(bdev); - bdev->bd_inode->i_mapping->wb_err = 0; - } - set_capacity(lo->lo_disk, 0); + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); - if (bdev) { - /* let user-space know about this change */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - } + /* let user-space know about this change */ + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); - partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; + partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; lo_number = lo->lo_number; disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); out_unlock: @@ -1498,7 +1241,6 @@ static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - kuid_t uid = current_uid(); int prev_lo_flags; bool partscan = false; bool size_changed = false; @@ -1506,12 +1248,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; - if (lo->lo_encrypt_key_size && - !uid_eq(lo->lo_key_owner, uid) && - !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out_unlock; - } if (lo->lo_state != Lo_bound) { err = -ENXIO; goto out_unlock; @@ -1597,14 +1333,6 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) info->lo_sizelimit = lo->lo_sizelimit; info->lo_flags = lo->lo_flags; memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); - memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); - info->lo_encrypt_type = - lo->lo_encryption ? 
lo->lo_encryption->number : 0; - if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { - info->lo_encrypt_key_size = lo->lo_encrypt_key_size; - memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, - lo->lo_encrypt_key_size); - } /* Drop lo_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; @@ -1630,16 +1358,8 @@ loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) info64->lo_rdevice = info->lo_rdevice; info64->lo_offset = info->lo_offset; info64->lo_sizelimit = 0; - info64->lo_encrypt_type = info->lo_encrypt_type; - info64->lo_encrypt_key_size = info->lo_encrypt_key_size; info64->lo_flags = info->lo_flags; - info64->lo_init[0] = info->lo_init[0]; - info64->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE); - else - memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); - memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); } static int @@ -1651,16 +1371,8 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) info->lo_inode = info64->lo_inode; info->lo_rdevice = info64->lo_rdevice; info->lo_offset = info64->lo_offset; - info->lo_encrypt_type = info64->lo_encrypt_type; - info->lo_encrypt_key_size = info64->lo_encrypt_key_size; info->lo_flags = info64->lo_flags; - info->lo_init[0] = info64->lo_init[0]; - info->lo_init[1] = info64->lo_init[1]; - if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); - else - memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); - memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info->lo_device != info64->lo_device || @@ -1759,7 +1471,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) if (lo->lo_state != Lo_bound) return -ENXIO; - err = loop_validate_block_size(arg); + err = blk_validate_block_size(arg); if (err) return err; @@ -1809,7 +1521,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, err = loop_set_block_size(lo, arg); break; default: - err = lo->ioctl ? 
lo->ioctl(lo, cmd, arg) : -EINVAL; + err = -EINVAL; } mutex_unlock(&lo->lo_mutex); return err; @@ -1885,7 +1597,6 @@ struct compat_loop_info { compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; - compat_int_t lo_encrypt_type; compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; @@ -1914,16 +1625,8 @@ loop_info64_from_compat(const struct compat_loop_info __user *arg, info64->lo_rdevice = info.lo_rdevice; info64->lo_offset = info.lo_offset; info64->lo_sizelimit = 0; - info64->lo_encrypt_type = info.lo_encrypt_type; - info64->lo_encrypt_key_size = info.lo_encrypt_key_size; info64->lo_flags = info.lo_flags; - info64->lo_init[0] = info.lo_init[0]; - info64->lo_init[1] = info.lo_init[1]; - if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE); - else - memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); - memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE); + memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); return 0; } @@ -1943,24 +1646,14 @@ loop_info64_to_compat(const struct loop_info64 *info64, info.lo_inode = info64->lo_inode; info.lo_rdevice = info64->lo_rdevice; info.lo_offset = info64->lo_offset; - info.lo_encrypt_type = info64->lo_encrypt_type; - info.lo_encrypt_key_size = info64->lo_encrypt_key_size; info.lo_flags = info64->lo_flags; - info.lo_init[0] = info64->lo_init[0]; - info.lo_init[1] = info64->lo_init[1]; - if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE); - else - memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); - memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info.lo_device != info64->lo_device || info.lo_rdevice != info64->lo_rdevice || info.lo_inode != info64->lo_inode || - info.lo_offset != info64->lo_offset || - info.lo_init[0] != info64->lo_init[0] || - info.lo_init[1] != info64->lo_init[1]) + info.lo_offset != info64->lo_offset) return -EOVERFLOW; if (copy_to_user(arg, &info, sizeof(info))) @@ -2101,43 +1794,6 @@ MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); -int loop_register_transfer(struct loop_func_table *funcs) -{ - unsigned int n = funcs->number; - - if (n >= MAX_LO_CRYPT || xfer_funcs[n]) - return -EINVAL; - xfer_funcs[n] = funcs; - return 0; -} - -int loop_unregister_transfer(int number) -{ - unsigned int n = number; - struct loop_func_table *xfer; - - if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL) - return -EINVAL; - /* - * This function is called from only cleanup_cryptoloop(). - * Given that each loop device that has a transfer enabled holds a - * reference to the module implementing it we should never get here - * with a transfer that is set (unless forced module unloading is - * requested). Thus, check module's refcount and warn if this is - * not a clean unloading. - */ -#ifdef CONFIG_MODULE_UNLOAD - if (xfer->owner && module_refcount(xfer->owner) != -1) - pr_err("Danger! 
Unregistering an in use transfer function.\n"); -#endif - - xfer_funcs[n] = NULL; - return 0; -} - -EXPORT_SYMBOL(loop_register_transfer); -EXPORT_SYMBOL(loop_unregister_transfer); - static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2394,13 +2050,19 @@ static int loop_add(int i) disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); /* Make this loop device reachable from pathname. */ - add_disk(disk); + err = add_disk(disk); + if (err) + goto out_cleanup_disk; + /* Show this loop device. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); + return i; +out_cleanup_disk: + blk_cleanup_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 04c88dd6eabd..082d4b6bfc6a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -32,23 +32,10 @@ struct loop_device { loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; - int (*transfer)(struct loop_device *, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t real_block); char lo_file_name[LO_NAME_SIZE]; - char lo_crypt_name[LO_NAME_SIZE]; - char lo_encrypt_key[LO_KEY_SIZE]; - int lo_encrypt_key_size; - struct loop_func_table *lo_encryption; - __u32 lo_init[2]; - kuid_t lo_key_owner; /* Who set the key */ - int (*ioctl)(struct loop_device *, int cmd, - unsigned long arg); struct file * lo_backing_file; struct block_device *lo_device; - void *key_data; gfp_t old_gfp_mask; @@ -82,21 +69,4 @@ struct loop_cmd { struct cgroup_subsys_state *memcg_css; }; -/* Support for loadable transfer modules */ -struct loop_func_table { - int number; /* filter type */ - int (*transfer)(struct loop_device *lo, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t real_block); - int (*init)(struct loop_device *, const struct loop_info64 *); - /* release is called from loop_unregister_transfer or clr_fd */ - int (*release)(struct loop_device *); - int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); - struct module *owner; -}; - -int loop_register_transfer(struct loop_func_table *funcs); -int loop_unregister_transfer(int number); - #endif diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 901855717cb5..c91b9010c1a6 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3633,7 +3633,9 @@ skip_create_disk: set_capacity(dd->disk, capacity); /* Enable the block device and add it to /dev */ - device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups); + rv = device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups); + if (rv) + goto read_capacity_error; if (dd->mtip_svc_handler) { set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); @@ -4061,7 +4063,6 @@ block_initialize_err: msi_initialize_err: if (dd->isr_workq) { - flush_workqueue(dd->isr_workq); destroy_workqueue(dd->isr_workq); drop_cpu(dd->work[0].cpu_binding); drop_cpu(dd->work[1].cpu_binding); @@ -4119,7 +4120,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) mtip_block_remove(dd); if (dd->isr_workq) { - flush_workqueue(dd->isr_workq); destroy_workqueue(dd->isr_workq); drop_cpu(dd->work[0].cpu_binding); drop_cpu(dd->work[1].cpu_binding); diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 26798da661bd..78282f01f581 100644 --- a/drivers/block/n64cart.c +++ 
b/drivers/block/n64cart.c @@ -84,7 +84,7 @@ static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos) return true; } -static blk_qc_t n64cart_submit_bio(struct bio *bio) +static void n64cart_submit_bio(struct bio *bio) { struct bio_vec bvec; struct bvec_iter iter; @@ -92,16 +92,14 @@ static blk_qc_t n64cart_submit_bio(struct bio *bio) u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT; bio_for_each_segment(bvec, bio, iter) { - if (!n64cart_do_bvec(dev, &bvec, pos)) - goto io_error; + if (!n64cart_do_bvec(dev, &bvec, pos)) { + bio_io_error(bio); + return; + } pos += bvec.bv_len; } bio_endio(bio); - return BLK_QC_T_NONE; -io_error: - bio_io_error(bio); - return BLK_QC_T_NONE; } static const struct block_device_operations n64cart_fops = { @@ -117,6 +115,7 @@ static const struct block_device_operations n64cart_fops = { static int __init n64cart_probe(struct platform_device *pdev) { struct gendisk *disk; + int err = -ENOMEM; if (!start || !size) { pr_err("start or size not specified\n"); @@ -134,7 +133,7 @@ static int __init n64cart_probe(struct platform_device *pdev) disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - return -ENOMEM; + goto out; disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART_SCAN; @@ -149,11 +148,18 @@ static int __init n64cart_probe(struct platform_device *pdev) blk_queue_physical_block_size(disk->queue, 4096); blk_queue_logical_block_size(disk->queue, 4096); - add_disk(disk); + err = add_disk(disk); + if (err) + goto out_cleanup_disk; pr_info("n64cart: %u kb disk\n", size / 1024); return 0; + +out_cleanup_disk: + blk_cleanup_disk(disk); +out: + return err; } static struct platform_driver n64cart_driver = { diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1183f7872b71..b47b2a87ae8f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -122,15 +122,21 @@ struct nbd_device { struct work_struct remove_work; struct list_head list; - struct task_struct *task_recv; struct task_struct *task_setup; unsigned long flags; + pid_t pid; /* pid of nbd-client, if attached */ char *backend; }; #define NBD_CMD_REQUEUED 1 +/* + * This flag will be set if nbd_queue_rq() succeeds, and will be checked and + * cleared in completion. Both setting and clearing of the flag are protected + * by cmd->lock.
+ */ +#define NBD_CMD_INFLIGHT 2 struct nbd_cmd { struct nbd_device *nbd; @@ -217,7 +223,7 @@ static ssize_t pid_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); + return sprintf(buf, "%d\n", nbd->pid); } static const struct device_attribute pid_attr = { @@ -310,26 +316,19 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, nsock->sent = 0; } -static void nbd_size_clear(struct nbd_device *nbd) -{ - if (nbd->config->bytesize) { - set_capacity(nbd->disk, 0); - kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); - } -} - static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { if (!blksize) blksize = 1u << NBD_DEF_BLKSIZE_BITS; - if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize)) + + if (blk_validate_block_size(blksize)) return -EINVAL; nbd->config->bytesize = bytesize; nbd->config->blksize_bits = __ffs(blksize); - if (!nbd->task_recv) + if (!nbd->pid) return 0; if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { @@ -405,6 +404,11 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return BLK_EH_DONE; + } + if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; mutex_unlock(&cmd->lock); @@ -484,7 +488,8 @@ done: } /* - * Send or receive packet. + * Send or receive packet. Return a positive value on success and + * negative value on failure, and never return 0. */ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct iov_iter *iter, int msg_flags, int *sent) @@ -610,7 +615,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) result = sock_xmit(nbd, index, 1, &from, (type == NBD_CMD_WRITE) ?
MSG_MORE : 0, &sent); trace_nbd_header_sent(req, handle); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* If we haven't sent anything we can just return BUSY, * however if we have sent something we need to make @@ -654,7 +659,7 @@ send_pages: skip = 0; } result = sock_xmit(nbd, index, 1, &from, flags, &sent); - if (result <= 0) { + if (result < 0) { if (was_interrupted(result)) { /* We've already sent the header, we * have no choice but to set pending and @@ -688,38 +693,45 @@ out: return 0; } -/* NULL returned = something went wrong, inform userspace */ -static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) +static int nbd_read_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) { - struct nbd_config *config = nbd->config; - int result; - struct nbd_reply reply; - struct nbd_cmd *cmd; - struct request *req = NULL; - u64 handle; - u16 hwq; - u32 tag; - struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)}; + struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; struct iov_iter to; - int ret = 0; + int result; - reply.magic = 0; - iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); + reply->magic = 0; + iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { - if (!nbd_disconnected(config)) + if (result < 0) { + if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); - return ERR_PTR(result); + return result; } - if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { + if (ntohl(reply->magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", - (unsigned long)ntohl(reply.magic)); - return ERR_PTR(-EPROTO); + (unsigned long)ntohl(reply->magic)); + return -EPROTO; } - memcpy(&handle, reply.handle, sizeof(handle)); + return 0; +} + +/* NULL returned = something went wrong, inform userspace */ +static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, + struct nbd_reply *reply) +{ + int result; + struct nbd_cmd *cmd; + struct request *req = NULL; + u64 handle; + u16 hwq; + u32 tag; + int ret = 0; + + memcpy(&handle, reply->handle, sizeof(handle)); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) @@ -734,6 +746,16 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", + tag, cmd->status, cmd->flags); + ret = -ENOENT; + goto out; + } + if (cmd->index != index) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)", + tag, index, cmd->index); + } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); @@ -752,9 +774,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) ret = -ENOENT; goto out; } - if (ntohl(reply.error)) { + if (ntohl(reply->error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", - ntohl(reply.error)); + ntohl(reply->error)); cmd->status = BLK_STS_IOERR; goto out; } @@ -763,11 +785,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; +
struct iov_iter to; rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); - if (result <= 0) { + if (result < 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); /* @@ -776,7 +799,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) * and let the timeout stuff handle resubmitting * this request onto another connection. */ - if (nbd_disconnected(config)) { + if (nbd_disconnected(nbd->config)) { cmd->status = BLK_STS_IOERR; goto out; } @@ -800,24 +823,46 @@ static void recv_work(struct work_struct *work) work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; + struct request_queue *q = nbd->disk->queue; + struct nbd_sock *nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { - cmd = nbd_read_stat(nbd, args->index); - if (IS_ERR(cmd)) { - struct nbd_sock *nsock = config->socks[args->index]; + struct nbd_reply reply; - mutex_lock(&nsock->tx_lock); - nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); + if (nbd_read_reply(nbd, args->index, &reply)) + break; + + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during nbd_handle_reply(). + * If the queue is frozen, there won't be any inflight requests, so we + * needn't handle the incoming garbage message. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) { + dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n", + __func__); + break; + } + + cmd = nbd_handle_reply(nbd, args->index, &reply); + if (IS_ERR(cmd)) { + percpu_ref_put(&q->q_usage_counter); break; } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); + percpu_ref_put(&q->q_usage_counter); } + + nsock = config->socks[args->index]; + mutex_lock(&nsock->tx_lock); + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); + nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); @@ -833,6 +878,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) return true; mutex_lock(&cmd->lock); + if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return true; + } cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); @@ -914,7 +963,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); - blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -923,7 +971,6 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); - blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -947,7 +994,6 @@ again: */ sock_shutdown(nbd); nbd_config_put(nbd); - blk_mq_start_request(req); return -EIO; } goto again; @@ -969,7 +1015,13 @@ again: * returns EAGAIN can be retried on a different socket. */ ret = nbd_send_cmd(nbd, cmd, index); - if (ret == -EAGAIN) { + /* + * Access to this flag is protected by cmd->lock, thus it's safe to set + * the flag after nbd_send_cmd() succeeds in sending the request to the server.
+ */ + if (!ret) + __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); + else if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); @@ -1206,7 +1258,7 @@ static void send_disconnects(struct nbd_device *nbd) iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); - if (ret <= 0) + if (ret < 0) dev_err(disk_to_dev(nbd->disk), "Send disconnect failed %d\n", ret); mutex_unlock(&nsock->tx_lock); @@ -1237,11 +1289,13 @@ static void nbd_config_put(struct nbd_device *nbd) &nbd->config_lock)) { struct nbd_config *config = nbd->config; nbd_dev_dbg_close(nbd); - nbd_size_clear(nbd); + invalidate_disk(nbd->disk); + if (nbd->config->bytesize) + kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - nbd->task_recv = NULL; + nbd->pid = 0; if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags)) { device_remove_file(disk_to_dev(nbd->disk), &backend_attr); @@ -1282,7 +1336,7 @@ static int nbd_start_device(struct nbd_device *nbd) int num_connections = config->num_connections; int error = 0, i; - if (nbd->task_recv) + if (nbd->pid) return -EBUSY; if (!config->socks) return -EINVAL; @@ -1301,7 +1355,7 @@ static int nbd_start_device(struct nbd_device *nbd) } blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); - nbd->task_recv = current; + nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); @@ -1557,8 +1611,8 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; - if (nbd->task_recv) - seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); + if (nbd->pid) + seq_printf(s, "recv: %d\n", nbd->pid); return 0; } @@ -1762,7 +1816,9 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); - add_disk(disk); + err = add_disk(disk); + if (err) + goto out_err_disk; /* * Now publish the device. 
@@ -1771,6 +1827,8 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) nbd_total_devices++; return nbd; +out_err_disk: + blk_cleanup_disk(disk); out_free_idr: mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); @@ -2135,7 +2193,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || - !nbd->task_recv) { + !nbd->pid) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); ret = -EINVAL; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 187d779c8ca0..323af5c9c802 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -92,6 +92,10 @@ static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, 0444); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); +static int g_poll_queues = 1; +module_param_named(poll_queues, g_poll_queues, int, 0444); +MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues"); + static int g_home_node = NUMA_NO_NODE; module_param_named(home_node, g_home_node, int, 0444); MODULE_PARM_DESC(home_node, "Home node for the device"); @@ -324,29 +328,69 @@ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ } \ CONFIGFS_ATTR(nullb_device_, NAME); -static int nullb_apply_submit_queues(struct nullb_device *dev, - unsigned int submit_queues) +static int nullb_update_nr_hw_queues(struct nullb_device *dev, + unsigned int submit_queues, + unsigned int poll_queues) + { - struct nullb *nullb = dev->nullb; struct blk_mq_tag_set *set; + int ret, nr_hw_queues; - if (!nullb) + if (!dev->nullb) return 0; /* + * Make sure at least one queue exists for each of submit and poll. + */ + if (!submit_queues || !poll_queues) + return -EINVAL; + + /* * Make sure that null_init_hctx() does not access nullb->queues[] past * the end of that array. */ - if (submit_queues > nr_cpu_ids) + if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues) return -EINVAL; - set = nullb->tag_set; - blk_mq_update_nr_hw_queues(set, submit_queues); - return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; + + /* + * Keep previous and new queue numbers in nullb_device for reference in + * the call back function null_map_queues(). + */ + dev->prev_submit_queues = dev->submit_queues; + dev->prev_poll_queues = dev->poll_queues; + dev->submit_queues = submit_queues; + dev->poll_queues = poll_queues; + + set = dev->nullb->tag_set; + nr_hw_queues = submit_queues + poll_queues; + blk_mq_update_nr_hw_queues(set, nr_hw_queues); + ret = set->nr_hw_queues == nr_hw_queues ? 
0 : -ENOMEM; + + if (ret) { + /* on error, revert the queue numbers */ + dev->submit_queues = dev->prev_submit_queues; + dev->poll_queues = dev->prev_poll_queues; + } + + return ret; +} + +static int nullb_apply_submit_queues(struct nullb_device *dev, + unsigned int submit_queues) +{ + return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); +} + +static int nullb_apply_poll_queues(struct nullb_device *dev, + unsigned int poll_queues) +{ + return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); } NULLB_DEVICE_ATTR(size, ulong, NULL); NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); @@ -466,6 +510,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_size, &nullb_device_attr_completion_nsec, &nullb_device_attr_submit_queues, + &nullb_device_attr_poll_queues, &nullb_device_attr_home_node, &nullb_device_attr_queue_mode, &nullb_device_attr_blocksize, @@ -593,6 +638,9 @@ static struct nullb_device *null_alloc_dev(void) dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; + dev->prev_submit_queues = g_submit_queues; + dev->poll_queues = g_poll_queues; + dev->prev_poll_queues = g_poll_queues; dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; @@ -1422,7 +1470,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb) return &nullb->queues[index]; } -static blk_qc_t null_submit_bio(struct bio *bio) +static void null_submit_bio(struct bio *bio) { sector_t sector = bio->bi_iter.bi_sector; sector_t nr_sectors = bio_sectors(bio); @@ -1434,7 +1482,6 @@ static blk_qc_t null_submit_bio(struct bio *bio) cmd->bio = bio; null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); - return BLK_QC_T_NONE; } static bool should_timeout_request(struct request *rq) @@ -1455,12 +1502,100 @@ static bool should_requeue_request(struct request *rq) return false; } +static int null_map_queues(struct blk_mq_tag_set *set) +{ + struct nullb *nullb = set->driver_data; + int i, qoff; + unsigned int submit_queues = g_submit_queues; + unsigned int poll_queues = g_poll_queues; + + if (nullb) { + struct nullb_device *dev = nullb->dev; + + /* + * Refer nr_hw_queues of the tag set to check if the expected + * number of hardware queues are prepared. If block layer failed + * to prepare them, use previous numbers of submit queues and + * poll queues to map queues. 
+ */ + if (set->nr_hw_queues == + dev->submit_queues + dev->poll_queues) { + submit_queues = dev->submit_queues; + poll_queues = dev->poll_queues; + } else if (set->nr_hw_queues == + dev->prev_submit_queues + dev->prev_poll_queues) { + submit_queues = dev->prev_submit_queues; + poll_queues = dev->prev_poll_queues; + } else { + pr_warn("tag set has unexpected nr_hw_queues: %d\n", + set->nr_hw_queues); + return -EINVAL; + } + } + + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + switch (i) { + case HCTX_TYPE_DEFAULT: + map->nr_queues = submit_queues; + break; + case HCTX_TYPE_READ: + map->nr_queues = 0; + continue; + case HCTX_TYPE_POLL: + map->nr_queues = poll_queues; + break; + } + map->queue_offset = qoff; + qoff += map->nr_queues; + blk_mq_map_queues(map); + } + + return 0; +} + +static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct nullb_queue *nq = hctx->driver_data; + LIST_HEAD(list); + int nr = 0; + + spin_lock(&nq->poll_lock); + list_splice_init(&nq->poll_list, &list); + spin_unlock(&nq->poll_lock); + + while (!list_empty(&list)) { + struct nullb_cmd *cmd; + struct request *req; + + req = list_first_entry(&list, struct request, queuelist); + list_del_init(&req->queuelist); + cmd = blk_mq_rq_to_pdu(req); + cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), + blk_rq_sectors(req)); + end_cmd(cmd); + nr++; + } + + return nr; +} + static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); pr_info("rq %p timed out\n", rq); + if (hctx->type == HCTX_TYPE_POLL) { + struct nullb_queue *nq = hctx->driver_data; + + spin_lock(&nq->poll_lock); + list_del_init(&rq->queuelist); + spin_unlock(&nq->poll_lock); + } + /* * If the device is marked as blocking (i.e. 
memory backed or zoned * device), the submission path may be blocked waiting for resources @@ -1481,10 +1616,11 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, struct nullb_queue *nq = hctx->driver_data; sector_t nr_sectors = blk_rq_sectors(bd->rq); sector_t sector = blk_rq_pos(bd->rq); + const bool is_poll = hctx->type == HCTX_TYPE_POLL; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - if (nq->dev->irqmode == NULL_IRQ_TIMER) { + if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; } @@ -1508,6 +1644,13 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } } + + if (is_poll) { + spin_lock(&nq->poll_lock); + list_add_tail(&bd->rq->queuelist, &nq->poll_list); + spin_unlock(&nq->poll_lock); + return BLK_STS_OK; + } if (cmd->fake_timeout) return BLK_STS_OK; @@ -1543,6 +1686,8 @@ static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) init_waitqueue_head(&nq->wait); nq->queue_depth = nullb->queue_depth; nq->dev = nullb->dev; + INIT_LIST_HEAD(&nq->poll_list); + spin_lock_init(&nq->poll_lock); } static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, @@ -1568,6 +1713,8 @@ static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .complete = null_complete_rq, .timeout = null_timeout_rq, + .poll = null_poll, + .map_queues = null_map_queues, .init_hctx = null_init_hctx, .exit_hctx = null_exit_hctx, }; @@ -1664,13 +1811,17 @@ static int setup_commands(struct nullb_queue *nq) static int setup_queues(struct nullb *nullb) { - nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), + int nqueues = nr_cpu_ids; + + if (g_poll_queues) + nqueues += g_poll_queues; + + nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue), GFP_KERNEL); if (!nullb->queues) return -ENOMEM; nullb->queue_depth = nullb->dev->hw_queue_depth; - return 0; } @@ -1722,9 +1873,14 @@ static int null_gendisk_register(struct nullb *nullb) static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) { + int poll_queues; + set->ops = &null_mq_ops; set->nr_hw_queues = nullb ? nullb->dev->submit_queues : g_submit_queues; + poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues; + if (poll_queues) + set->nr_hw_queues += poll_queues; set->queue_depth = nullb ? nullb->dev->hw_queue_depth : g_hw_queue_depth; set->numa_node = nullb ? 
nullb->dev->home_node : g_home_node; @@ -1734,7 +1890,11 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->flags |= BLK_MQ_F_NO_SCHED; if (g_shared_tag_bitmap) set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; - set->driver_data = NULL; + set->driver_data = nullb; + if (g_poll_queues) + set->nr_maps = 3; + else + set->nr_maps = 1; if ((nullb && nullb->dev->blocking) || g_blocking) set->flags |= BLK_MQ_F_BLOCKING; @@ -1754,6 +1914,13 @@ static int null_validate_conf(struct nullb_device *dev) dev->submit_queues = nr_cpu_ids; else if (dev->submit_queues == 0) dev->submit_queues = 1; + dev->prev_submit_queues = dev->submit_queues; + + if (dev->poll_queues > g_poll_queues) + dev->poll_queues = g_poll_queues; + else if (dev->poll_queues == 0) + dev->poll_queues = 1; + dev->prev_poll_queues = dev->poll_queues; dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 64bef125d1df..78eb56b0ca55 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -32,6 +32,9 @@ struct nullb_queue { struct nullb_device *dev; unsigned int requeue_selection; + struct list_head poll_list; + spinlock_t poll_lock; + struct nullb_cmd *cmds; }; @@ -83,6 +86,9 @@ struct nullb_device { unsigned int zone_max_open; /* max number of open zones */ unsigned int zone_max_active; /* max number of active zones */ unsigned int submit_queues; /* number of submission queues */ + unsigned int prev_submit_queues; /* number of submission queues before change */ + unsigned int poll_queues; /* number of IOPOLL submission queues */ + unsigned int prev_poll_queues; /* number of IOPOLL submission queues before change */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ unsigned int blocksize; /* block size */ diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index f9cdd11f02f5..f6b1d63e96e1 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -183,8 +183,6 @@ static int pcd_audio_ioctl(struct cdrom_device_info *cdi, static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc); -static int pcd_detect(void); -static void pcd_probe_capabilities(void); static void do_pcd_read_drq(void); static blk_status_t pcd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd); @@ -302,53 +300,6 @@ static const struct blk_mq_ops pcd_mq_ops = { .queue_rq = pcd_queue_rq, }; -static void pcd_init_units(void) -{ - struct pcd_unit *cd; - int unit; - - pcd_drive_count = 0; - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - struct gendisk *disk; - - if (blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1, - BLK_MQ_F_SHOULD_MERGE)) - continue; - - disk = blk_mq_alloc_disk(&cd->tag_set, cd); - if (IS_ERR(disk)) { - blk_mq_free_tag_set(&cd->tag_set); - continue; - } - - INIT_LIST_HEAD(&cd->rq_list); - blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); - cd->disk = disk; - cd->pi = &cd->pia; - cd->present = 0; - cd->last_sense = 0; - cd->changed = 1; - cd->drive = (*drives[unit])[D_SLV]; - if ((*drives[unit])[D_PRT]) - pcd_drive_count++; - - cd->name = &cd->info.name[0]; - snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit); - cd->info.ops = &pcd_dops; - cd->info.handle = cd; - cd->info.speed = 0; - cd->info.capacity = 1; - cd->info.mask = 0; - disk->major = major; - disk->first_minor = unit; 
- disk->minors = 1; - strcpy(disk->disk_name, cd->name); /* umm... */ - disk->fops = &pcd_bdops; - disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; - disk->events = DISK_EVENT_MEDIA_CHANGE; - } -} - static int pcd_open(struct cdrom_device_info *cdi, int purpose) { struct pcd_unit *cd = cdi->handle; @@ -630,10 +581,11 @@ static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr) return CDS_DISC_OK; } -static int pcd_identify(struct pcd_unit *cd, char *id) +static int pcd_identify(struct pcd_unit *cd) { - int k, s; char id_cmd[12] = { 0x12, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 }; + char id[18]; + int k, s; pcd_bufblk = -1; @@ -661,108 +613,47 @@ static int pcd_identify(struct pcd_unit *cd, char *id) } /* - * returns 0, with id set if drive is detected - * -1, if drive detection failed + * returns 0, with id set if drive is detected, otherwise an error code. */ -static int pcd_probe(struct pcd_unit *cd, int ms, char *id) +static int pcd_probe(struct pcd_unit *cd, int ms) { if (ms == -1) { for (cd->drive = 0; cd->drive <= 1; cd->drive++) - if (!pcd_reset(cd) && !pcd_identify(cd, id)) + if (!pcd_reset(cd) && !pcd_identify(cd)) return 0; } else { cd->drive = ms; - if (!pcd_reset(cd) && !pcd_identify(cd, id)) + if (!pcd_reset(cd) && !pcd_identify(cd)) return 0; } - return -1; + return -ENODEV; } -static void pcd_probe_capabilities(void) +static int pcd_probe_capabilities(struct pcd_unit *cd) { - int unit, r; - char buffer[32]; char cmd[12] = { 0x5a, 1 << 3, 0x2a, 0, 0, 0, 0, 18, 0, 0, 0, 0 }; - struct pcd_unit *cd; - - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - if (!cd->present) - continue; - r = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities"); - if (r) - continue; - /* we should now have the cap page */ - if ((buffer[11] & 1) == 0) - cd->info.mask |= CDC_CD_R; - if ((buffer[11] & 2) == 0) - cd->info.mask |= CDC_CD_RW; - if ((buffer[12] & 1) == 0) - cd->info.mask |= CDC_PLAY_AUDIO; - if ((buffer[14] & 1) == 0) - cd->info.mask |= CDC_LOCK; - if ((buffer[14] & 8) == 0) - cd->info.mask |= CDC_OPEN_TRAY; - if ((buffer[14] >> 6) == 0) - cd->info.mask |= CDC_CLOSE_TRAY; - } -} - -static int pcd_detect(void) -{ - char id[18]; - int k, unit; - struct pcd_unit *cd; + char buffer[32]; + int ret; - printk("%s: %s version %s, major %d, nice %d\n", - name, name, PCD_VERSION, major, nice); + ret = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities"); + if (ret) + return ret; + + /* we should now have the cap page */ + if ((buffer[11] & 1) == 0) + cd->info.mask |= CDC_CD_R; + if ((buffer[11] & 2) == 0) + cd->info.mask |= CDC_CD_RW; + if ((buffer[12] & 1) == 0) + cd->info.mask |= CDC_PLAY_AUDIO; + if ((buffer[14] & 1) == 0) + cd->info.mask |= CDC_LOCK; + if ((buffer[14] & 8) == 0) + cd->info.mask |= CDC_OPEN_TRAY; + if ((buffer[14] >> 6) == 0) + cd->info.mask |= CDC_CLOSE_TRAY; - par_drv = pi_register_driver(name); - if (!par_drv) { - pr_err("failed to register %s driver\n", name); - return -1; - } - - k = 0; - if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ - cd = pcd; - if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1, - pcd_buffer, PI_PCD, verbose, cd->name)) { - if (!pcd_probe(cd, -1, id)) { - cd->present = 1; - k++; - } else - pi_release(cd->pi); - } - } else { - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - int *conf = *drives[unit]; - if (!conf[D_PRT]) - continue; - if (!cd->disk) - continue; - if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD], - conf[D_UNI], conf[D_PRO], conf[D_DLY], - pcd_buffer, PI_PCD, verbose, 
cd->name)) - continue; - if (!pcd_probe(cd, conf[D_SLV], id)) { - cd->present = 1; - k++; - } else - pi_release(cd->pi); - } - } - if (k) - return 0; - - printk("%s: No CD-ROM drive found\n", name); - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - if (!cd->disk) - continue; - blk_cleanup_disk(cd->disk); - blk_mq_free_tag_set(&cd->tag_set); - } - pi_unregister_driver(par_drv); - return -1; + return 0; } /* I/O request processing */ @@ -999,43 +890,130 @@ static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn) return 0; } +static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port, + int mode, int unit, int protocol, int delay, int ms) +{ + struct gendisk *disk; + int ret; + + ret = blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1, + BLK_MQ_F_SHOULD_MERGE); + if (ret) + return ret; + + disk = blk_mq_alloc_disk(&cd->tag_set, cd); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); + goto out_free_tag_set; + } + + INIT_LIST_HEAD(&cd->rq_list); + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); + cd->disk = disk; + cd->pi = &cd->pia; + cd->present = 0; + cd->last_sense = 0; + cd->changed = 1; + cd->drive = (*drives[cd - pcd])[D_SLV]; + + cd->name = &cd->info.name[0]; + snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit); + cd->info.ops = &pcd_dops; + cd->info.handle = cd; + cd->info.speed = 0; + cd->info.capacity = 1; + cd->info.mask = 0; + disk->major = major; + disk->first_minor = unit; + disk->minors = 1; + strcpy(disk->disk_name, cd->name); /* umm... */ + disk->fops = &pcd_bdops; + disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + disk->events = DISK_EVENT_MEDIA_CHANGE; + + if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay, + pcd_buffer, PI_PCD, verbose, cd->name)) { + ret = -ENODEV; + goto out_free_disk; + } + ret = pcd_probe(cd, ms); + if (ret) + goto out_pi_release; + + cd->present = 1; + pcd_probe_capabilities(cd); + ret = register_cdrom(cd->disk, &cd->info); + if (ret) + goto out_pi_release; + ret = add_disk(cd->disk); + if (ret) + goto out_unreg_cdrom; + return 0; + +out_unreg_cdrom: + unregister_cdrom(&cd->info); +out_pi_release: + pi_release(cd->pi); +out_free_disk: + blk_cleanup_disk(cd->disk); +out_free_tag_set: + blk_mq_free_tag_set(&cd->tag_set); + return ret; +} + static int __init pcd_init(void) { - struct pcd_unit *cd; - int unit; + int found = 0, unit; if (disable) return -EINVAL; - pcd_init_units(); + if (register_blkdev(major, name)) + return -EBUSY; - if (pcd_detect()) - return -ENODEV; + pr_info("%s: %s version %s, major %d, nice %d\n", + name, name, PCD_VERSION, major, nice); - /* get the atapi capabilities page */ - pcd_probe_capabilities(); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + goto out_unregister_blkdev; + } - if (register_blkdev(major, name)) { - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - if (!cd->disk) - continue; + for (unit = 0; unit < PCD_UNITS; unit++) { + if ((*drives[unit])[D_PRT]) + pcd_drive_count++; + } + + if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ + if (!pcd_init_unit(pcd, 1, -1, -1, -1, -1, -1, -1)) + found++; + } else { + for (unit = 0; unit < PCD_UNITS; unit++) { + struct pcd_unit *cd = &pcd[unit]; + int *conf = *drives[unit]; - blk_cleanup_queue(cd->disk->queue); - blk_mq_free_tag_set(&cd->tag_set); - put_disk(cd->disk); + if (!conf[D_PRT]) + continue; + if (!pcd_init_unit(cd, 0, conf[D_PRT], conf[D_MOD], + conf[D_UNI], conf[D_PRO], conf[D_DLY], + conf[D_SLV])) 
+ found++; } - return -EBUSY; } - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - if (cd->present) { - register_cdrom(cd->disk, &cd->info); - cd->disk->private_data = cd; - add_disk(cd->disk); - } + if (!found) { + pr_info("%s: No CD-ROM drive found\n", name); + goto out_unregister_pi_driver; } return 0; + +out_unregister_pi_driver: + pi_unregister_driver(par_drv); +out_unregister_blkdev: + unregister_blkdev(major, name); + return -ENODEV; } static void __exit pcd_exit(void) @@ -1044,20 +1022,18 @@ static void __exit pcd_exit(void) int unit; for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - if (!cd->disk) + if (!cd->present) continue; - if (cd->present) { - del_gendisk(cd->disk); - pi_release(cd->pi); - unregister_cdrom(&cd->info); - } - blk_cleanup_queue(cd->disk->queue); + unregister_cdrom(&cd->info); + del_gendisk(cd->disk); + pi_release(cd->pi); + blk_cleanup_disk(cd->disk); + blk_mq_free_tag_set(&cd->tag_set); - put_disk(cd->disk); } - unregister_blkdev(major, name); pi_unregister_driver(par_drv); + unregister_blkdev(major, name); } MODULE_LICENSE("GPL"); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 675327df6aff..fba865058a17 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -775,14 +775,14 @@ static int pd_special_command(struct pd_unit *disk, struct request *rq; struct pd_req *req; - rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0); + rq = blk_mq_alloc_request(disk->gd->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); req = blk_mq_rq_to_pdu(rq); req->func = func; blk_execute_rq(disk->gd, rq, 0); - blk_put_request(rq); + blk_mq_free_request(rq); return 0; } @@ -875,9 +875,27 @@ static const struct blk_mq_ops pd_mq_ops = { .queue_rq = pd_queue_rq, }; -static void pd_probe_drive(struct pd_unit *disk) +static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port, + int mode, int unit, int protocol, int delay) { + int index = disk - pd; + int *parm = *drives[index]; struct gendisk *p; + int ret; + + disk->pi = &disk->pia; + disk->access = 0; + disk->changed = 1; + disk->capacity = 0; + disk->drive = parm[D_SLV]; + snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a' + index); + disk->alt_geom = parm[D_GEO]; + disk->standby = parm[D_SBY]; + INIT_LIST_HEAD(&disk->rq_list); + + if (!pi_init(disk->pi, autoprobe, port, mode, unit, protocol, delay, + pd_scratch, PI_PD, verbose, disk->name)) + return -ENXIO; memset(&disk->tag_set, 0, sizeof(disk->tag_set)); disk->tag_set.ops = &pd_mq_ops; @@ -887,14 +905,14 @@ static void pd_probe_drive(struct pd_unit *disk) disk->tag_set.queue_depth = 2; disk->tag_set.numa_node = NUMA_NO_NODE; disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; - - if (blk_mq_alloc_tag_set(&disk->tag_set)) - return; + ret = blk_mq_alloc_tag_set(&disk->tag_set); + if (ret) + goto pi_release; p = blk_mq_alloc_disk(&disk->tag_set, disk); if (IS_ERR(p)) { - blk_mq_free_tag_set(&disk->tag_set); - return; + ret = PTR_ERR(p); + goto free_tag_set; } disk->gd = p; @@ -905,102 +923,88 @@ static void pd_probe_drive(struct pd_unit *disk) p->minors = 1 << PD_BITS; p->events = DISK_EVENT_MEDIA_CHANGE; p->private_data = disk; - blk_queue_max_hw_sectors(p->queue, cluster); blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH); if (disk->drive == -1) { - for (disk->drive = 0; disk->drive <= 1; disk->drive++) - if (pd_special_command(disk, pd_identify) == 0) - return; - } else if (pd_special_command(disk, pd_identify) == 0) - return; - disk->gd = NULL; + for (disk->drive = 0; 
disk->drive <= 1; disk->drive++) { + ret = pd_special_command(disk, pd_identify); + if (ret == 0) + break; + } + } else { + ret = pd_special_command(disk, pd_identify); + } + if (ret) + goto put_disk; + set_capacity(disk->gd, disk->capacity); + ret = add_disk(disk->gd); + if (ret) + goto cleanup_disk; + return 0; +cleanup_disk: + blk_cleanup_disk(disk->gd); +put_disk: put_disk(p); + disk->gd = NULL; +free_tag_set: + blk_mq_free_tag_set(&disk->tag_set); +pi_release: + pi_release(disk->pi); + return ret; } -static int pd_detect(void) +static int __init pd_init(void) { int found = 0, unit, pd_drive_count = 0; struct pd_unit *disk; - for (unit = 0; unit < PD_UNITS; unit++) { - int *parm = *drives[unit]; - struct pd_unit *disk = pd + unit; - disk->pi = &disk->pia; - disk->access = 0; - disk->changed = 1; - disk->capacity = 0; - disk->drive = parm[D_SLV]; - snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a'+unit); - disk->alt_geom = parm[D_GEO]; - disk->standby = parm[D_SBY]; - if (parm[D_PRT]) - pd_drive_count++; - INIT_LIST_HEAD(&disk->rq_list); - } + if (disable) + return -ENODEV; + + if (register_blkdev(major, name)) + return -ENODEV; + + printk("%s: %s version %s, major %d, cluster %d, nice %d\n", + name, name, PD_VERSION, major, cluster, nice); par_drv = pi_register_driver(name); if (!par_drv) { pr_err("failed to register %s driver\n", name); - return -1; + goto out_unregister_blkdev; } - if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ - disk = pd; - if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch, - PI_PD, verbose, disk->name)) { - pd_probe_drive(disk); - if (!disk->gd) - pi_release(disk->pi); - } + for (unit = 0; unit < PD_UNITS; unit++) { + int *parm = *drives[unit]; + if (parm[D_PRT]) + pd_drive_count++; + } + + if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ + if (!pd_probe_drive(pd, 1, -1, -1, -1, -1, -1)) + found++; } else { for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) { int *parm = *drives[unit]; if (!parm[D_PRT]) continue; - if (pi_init(disk->pi, 0, parm[D_PRT], parm[D_MOD], - parm[D_UNI], parm[D_PRO], parm[D_DLY], - pd_scratch, PI_PD, verbose, disk->name)) { - pd_probe_drive(disk); - if (!disk->gd) - pi_release(disk->pi); - } - } - } - for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) { - if (disk->gd) { - set_capacity(disk->gd, disk->capacity); - add_disk(disk->gd); - found = 1; + if (!pd_probe_drive(disk, 0, parm[D_PRT], parm[D_MOD], + parm[D_UNI], parm[D_PRO], parm[D_DLY])) + found++; } } if (!found) { printk("%s: no valid drive found\n", name); - pi_unregister_driver(par_drv); + goto out_pi_unregister_driver; } - return found; -} - -static int __init pd_init(void) -{ - if (disable) - goto out1; - - if (register_blkdev(major, name)) - goto out1; - - printk("%s: %s version %s, major %d, cluster %d, nice %d\n", - name, name, PD_VERSION, major, cluster, nice); - if (!pd_detect()) - goto out2; return 0; -out2: +out_pi_unregister_driver: + pi_unregister_driver(par_drv); +out_unregister_blkdev: unregister_blkdev(major, name); -out1: return -ENODEV; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index d5b9c88ba76f..bf8d0ef41a0a 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -214,7 +214,6 @@ static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo); static void pf_release(struct gendisk *disk, fmode_t mode); -static int pf_detect(void); static void do_pf_read(void); static void do_pf_read_start(void); static void do_pf_write(void); @@ -285,45 +284,6 
@@ static const struct blk_mq_ops pf_mq_ops = { .queue_rq = pf_queue_rq, }; -static void __init pf_init_units(void) -{ - struct pf_unit *pf; - int unit; - - pf_drive_count = 0; - for (unit = 0, pf = units; unit < PF_UNITS; unit++, pf++) { - struct gendisk *disk; - - if (blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1, - BLK_MQ_F_SHOULD_MERGE)) - continue; - - disk = blk_mq_alloc_disk(&pf->tag_set, pf); - if (IS_ERR(disk)) { - blk_mq_free_tag_set(&pf->tag_set); - continue; - } - - INIT_LIST_HEAD(&pf->rq_list); - blk_queue_max_segments(disk->queue, cluster); - blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); - pf->disk = disk; - pf->pi = &pf->pia; - pf->media_status = PF_NM; - pf->drive = (*drives[unit])[D_SLV]; - pf->lun = (*drives[unit])[D_LUN]; - snprintf(pf->name, PF_NAMELEN, "%s%d", name, unit); - disk->major = major; - disk->first_minor = unit; - disk->minors = 1; - strcpy(disk->disk_name, pf->name); - disk->fops = &pf_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; - if (!(*drives[unit])[D_PRT]) - pf_drive_count++; - } -} - static int pf_open(struct block_device *bdev, fmode_t mode) { struct pf_unit *pf = bdev->bd_disk->private_data; @@ -691,9 +651,9 @@ static int pf_identify(struct pf_unit *pf) return 0; } -/* returns 0, with id set if drive is detected - -1, if drive detection failed -*/ +/* + * returns 0, with id set if drive is detected, otherwise an error code. + */ static int pf_probe(struct pf_unit *pf) { if (pf->drive == -1) { @@ -715,60 +675,7 @@ static int pf_probe(struct pf_unit *pf) if (!pf_identify(pf)) return 0; } - return -1; -} - -static int pf_detect(void) -{ - struct pf_unit *pf = units; - int k, unit; - - printk("%s: %s version %s, major %d, cluster %d, nice %d\n", - name, name, PF_VERSION, major, cluster, nice); - - par_drv = pi_register_driver(name); - if (!par_drv) { - pr_err("failed to register %s driver\n", name); - return -1; - } - k = 0; - if (pf_drive_count == 0) { - if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF, - verbose, pf->name)) { - if (!pf_probe(pf) && pf->disk) { - pf->present = 1; - k++; - } else - pi_release(pf->pi); - } - - } else - for (unit = 0; unit < PF_UNITS; unit++, pf++) { - int *conf = *drives[unit]; - if (!conf[D_PRT]) - continue; - if (pi_init(pf->pi, 0, conf[D_PRT], conf[D_MOD], - conf[D_UNI], conf[D_PRO], conf[D_DLY], - pf_scratch, PI_PF, verbose, pf->name)) { - if (pf->disk && !pf_probe(pf)) { - pf->present = 1; - k++; - } else - pi_release(pf->pi); - } - } - if (k) - return 0; - - printk("%s: No ATAPI disk detected\n", name); - for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { - if (!pf->disk) - continue; - blk_cleanup_disk(pf->disk); - blk_mq_free_tag_set(&pf->tag_set); - } - pi_unregister_driver(par_drv); - return -1; + return -ENODEV; } /* The i/o request engine */ @@ -1014,61 +921,134 @@ static void do_pf_write_done(void) next_request(0); } +static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port, + int mode, int unit, int protocol, int delay, int ms) +{ + struct gendisk *disk; + int ret; + + ret = blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1, + BLK_MQ_F_SHOULD_MERGE); + if (ret) + return ret; + + disk = blk_mq_alloc_disk(&pf->tag_set, pf); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); + goto out_free_tag_set; + } + disk->major = major; + disk->first_minor = pf - units; + disk->minors = 1; + strcpy(disk->disk_name, pf->name); + disk->fops = &pf_fops; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->private_data = pf; + + blk_queue_max_segments(disk->queue, cluster); + 
blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); + + INIT_LIST_HEAD(&pf->rq_list); + pf->disk = disk; + pf->pi = &pf->pia; + pf->media_status = PF_NM; + pf->drive = (*drives[disk->first_minor])[D_SLV]; + pf->lun = (*drives[disk->first_minor])[D_LUN]; + snprintf(pf->name, PF_NAMELEN, "%s%d", name, disk->first_minor); + + if (!pi_init(pf->pi, autoprobe, port, mode, unit, protocol, delay, + pf_scratch, PI_PF, verbose, pf->name)) { + ret = -ENODEV; + goto out_free_disk; + } + ret = pf_probe(pf); + if (ret) + goto out_pi_release; + + ret = add_disk(disk); + if (ret) + goto out_pi_release; + pf->present = 1; + return 0; + +out_pi_release: + pi_release(pf->pi); +out_free_disk: + blk_cleanup_disk(pf->disk); +out_free_tag_set: + blk_mq_free_tag_set(&pf->tag_set); + return ret; +} + static int __init pf_init(void) { /* preliminary initialisation */ struct pf_unit *pf; - int unit; + int found = 0, unit; if (disable) return -EINVAL; - pf_init_units(); + if (register_blkdev(major, name)) + return -EBUSY; - if (pf_detect()) - return -ENODEV; - pf_busy = 0; + printk("%s: %s version %s, major %d, cluster %d, nice %d\n", + name, name, PF_VERSION, major, cluster, nice); - if (register_blkdev(major, name)) { - for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { - if (!pf->disk) - continue; - blk_cleanup_queue(pf->disk->queue); - blk_mq_free_tag_set(&pf->tag_set); - put_disk(pf->disk); - } - return -EBUSY; + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + goto out_unregister_blkdev; } - for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { - struct gendisk *disk = pf->disk; + for (unit = 0; unit < PF_UNITS; unit++) { + if (!(*drives[unit])[D_PRT]) + pf_drive_count++; + } - if (!pf->present) - continue; - disk->private_data = pf; - add_disk(disk); + pf = units; + if (pf_drive_count == 0) { + if (pf_init_unit(pf, 1, -1, -1, -1, -1, -1, verbose)) + found++; + } else { + for (unit = 0; unit < PF_UNITS; unit++, pf++) { + int *conf = *drives[unit]; + if (!conf[D_PRT]) + continue; + if (pf_init_unit(pf, 0, conf[D_PRT], conf[D_MOD], + conf[D_UNI], conf[D_PRO], conf[D_DLY], + verbose)) + found++; + } + } + if (!found) { + printk("%s: No ATAPI disk detected\n", name); + goto out_unregister_pi_driver; } + pf_busy = 0; return 0; + +out_unregister_pi_driver: + pi_unregister_driver(par_drv); +out_unregister_blkdev: + unregister_blkdev(major, name); + return -ENODEV; } static void __exit pf_exit(void) { struct pf_unit *pf; int unit; - unregister_blkdev(major, name); + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { - if (!pf->disk) + if (!pf->present) continue; - - if (pf->present) - del_gendisk(pf->disk); - - blk_cleanup_queue(pf->disk->queue); + del_gendisk(pf->disk); + blk_cleanup_disk(pf->disk); blk_mq_free_tag_set(&pf->tag_set); - put_disk(pf->disk); - - if (pf->present) - pi_release(pf->pi); + pi_release(pf->pi); } + + unregister_blkdev(major, name); } MODULE_LICENSE("GPL"); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 0f26b2510a75..b53f648302c1 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -703,7 +703,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * struct request *rq; int ret = 0; - rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? + rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); @@ -726,7 +726,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * if (scsi_req(rq)->result) ret = -EIO; out: - blk_put_request(rq); + blk_mq_free_request(rq); return ret; } @@ -2400,7 +2400,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) } } -static blk_qc_t pkt_submit_bio(struct bio *bio) +static void pkt_submit_bio(struct bio *bio) { struct pktcdvd_device *pd; char b[BDEVNAME_SIZE]; @@ -2423,7 +2423,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) */ if (bio_data_dir(bio) == READ) { pkt_make_request_read(pd, bio); - return BLK_QC_T_NONE; + return; } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { @@ -2455,10 +2455,9 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); } while (split != bio); - return BLK_QC_T_NONE; + return; end_io: bio_io_error(bio); - return BLK_QC_T_NONE; } static void pkt_init_queue(struct pktcdvd_device *pd) @@ -2537,6 +2536,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) int i; char b[BDEVNAME_SIZE]; struct block_device *bdev; + struct scsi_device *sdev; if (pd->pkt_dev == dev) { pkt_err(pd, "recursive setup not allowed\n"); @@ -2560,10 +2560,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); - if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { + sdev = scsi_device_from_queue(bdev->bd_disk->queue); + if (!sdev) { blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); return -EINVAL; } + put_device(&sdev->sdev_gendev); /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); @@ -2729,7 +2731,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) /* inherit events of the host device */ disk->events = pd->bdev->bd_disk->events; - add_disk(disk); + ret = add_disk(disk); + if (ret) + goto out_mem2; pkt_sysfs_dev_new(pd); pkt_debugfs_dev_new(pd); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index c7b19e128b03..d1ebf193cb9a 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -578,7 +578,7 @@ out: return next; } -static blk_qc_t ps3vram_submit_bio(struct bio *bio) +static void ps3vram_submit_bio(struct bio *bio) { struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -594,13 +594,11 @@ static blk_qc_t ps3vram_submit_bio(struct bio *bio) spin_unlock_irq(&priv->lock); if (busy) - return BLK_QC_T_NONE; + return; do { bio = ps3vram_do_bio(dev, bio); } while (bio); - - return BLK_QC_T_NONE; } static const struct block_device_operations ps3vram_fops = { diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e65c9d706f6f..953fa134cd3d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -836,7 +836,7 @@ struct rbd_options { u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ }; -#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ +#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_DEFAULT_RQ #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ #define RBD_READ_ONLY_DEFAULT false @@ -7054,7 +7054,9 @@ static ssize_t do_rbd_add(struct bus_type *bus, if (rc) goto err_out_image_lock; - device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); + rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); + if (rc) + goto err_out_cleanup_disk; 
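The pktcdvd and ps3vram hunks above follow the tree-wide conversion in this series: ->submit_bio() no longer returns a blk_qc_t cookie, so bio-based drivers simply return once the bio is queued and report failures on the bio itself. A minimal sketch of the new shape, with a hypothetical "mybio" driver standing in for a real one (only the void signature and the bio_io_error() path mirror the converted drivers):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/module.h>

/* Hypothetical per-device state and helpers, for illustration only. */
struct mybio_dev {
	struct gendisk *gd;
};

static bool mybio_io_valid(struct mybio_dev *d, struct bio *bio)
{
	return bio->bi_iter.bi_size != 0;	/* placeholder sanity check */
}

static void mybio_queue_bio(struct mybio_dev *d, struct bio *bio)
{
	bio_endio(bio);		/* placeholder: complete immediately */
}

/* ->submit_bio() is void now; errors are signalled through the bio. */
static void mybio_submit_bio(struct bio *bio)
{
	struct mybio_dev *d = bio->bi_bdev->bd_disk->private_data;

	if (!mybio_io_valid(d, bio)) {
		bio_io_error(bio);	/* ends the bio with BLK_STS_IOERR */
		return;			/* nothing to return, BLK_QC_T_NONE is gone */
	}
	mybio_queue_bio(d, bio);
}

static const struct block_device_operations mybio_fops = {
	.owner		= THIS_MODULE,
	.submit_bio	= mybio_submit_bio,
};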
spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); @@ -7068,6 +7070,8 @@ out: module_put(THIS_MODULE); return rc; +err_out_cleanup_disk: + rbd_free_disk(rbd_dev); err_out_image_lock: rbd_dev_image_unlock(rbd_dev); rbd_dev_device_release(rbd_dev); diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 4b93fd83bf79..44e45af00e83 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -71,8 +71,10 @@ static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, int opt_mask = 0; int token; int ret = -EINVAL; - int i, dest_port, nr_poll_queues; + int nr_poll_queues = 0; + int dest_port = 0; int p_cnt = 0; + int i; options = kstrdup(buf, GFP_KERNEL); if (!options) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index bd4a41afbbfc..2df0657cdf00 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, return ret; } -static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx) +static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct rnbd_queue *q = hctx->driver_data; struct rnbd_clt_dev *dev = q->dev; @@ -1384,8 +1384,10 @@ static void setup_request_queue(struct rnbd_clt_dev *dev) blk_queue_write_cache(dev->queue, dev->wc, dev->fua); } -static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) +static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) { + int err; + dev->gd->major = rnbd_client_major; dev->gd->first_minor = idx << RNBD_PART_BITS; dev->gd->minors = 1 << RNBD_PART_BITS; @@ -1410,7 +1412,11 @@ static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) if (!dev->rotational) blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); - add_disk(dev->gd); + err = add_disk(dev->gd); + if (err) + blk_cleanup_disk(dev->gd); + + return err; } static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) @@ -1426,8 +1432,7 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) rnbd_init_mq_hw_queues(dev); setup_request_queue(dev); - rnbd_clt_setup_gen_disk(dev, idx); - return 0; + return rnbd_clt_setup_gen_disk(dev, idx); } static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index c1bc5c0fef71..de5d5a8df81d 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -10,7 +10,7 @@ #define RNBD_PROTO_H #include <linux/types.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/limits.h> #include <linux/inet.h> #include <linux/in.h> diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 83636714b8d7..8d9d69f5dfbc 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -935,7 +935,9 @@ static int rsxx_pci_probe(struct pci_dev *dev, card->size8 = 0; } - rsxx_attach_dev(card); + st = rsxx_attach_dev(card); + if (st) + goto failed_create_dev; /************* Setup Debugfs *************/ rsxx_debugfs_dev_new(card); diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 1cc40b0ea761..dd33f1bdf3b8 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -50,7 +50,7 @@ struct rsxx_bio_meta { static struct kmem_cache *bio_meta_pool; -static blk_qc_t rsxx_submit_bio(struct bio *bio); +static void rsxx_submit_bio(struct bio *bio); /*----------------- Block Device Operations -----------------*/ 
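The rbd, rnbd and rsxx hunks above all add the same thing: add_disk() and device_add_disk() now return an error, and callers have to check it and unwind in the reverse order of setup. A sketch of the resulting probe shape for a hypothetical blk-mq driver (the names and the trivial queue_rq are made up; only the goto ordering mirrors these hunks):

#include <linux/blk-mq.h>
#include <linux/err.h>

/* Hypothetical driver state; only the unwind ordering is the point here. */
struct mymq_dev {
	struct blk_mq_tag_set	tag_set;
	struct gendisk		*gd;
	sector_t		nr_sectors;
};

static blk_status_t mymq_queue_rq(struct blk_mq_hw_ctx *hctx,
				  const struct blk_mq_queue_data *bd)
{
	blk_mq_start_request(bd->rq);
	blk_mq_end_request(bd->rq, BLK_STS_OK);	/* placeholder completion */
	return BLK_STS_OK;
}

static const struct blk_mq_ops mymq_ops = {
	.queue_rq	= mymq_queue_rq,
};

static int mymq_probe_one(struct mymq_dev *d)
{
	struct gendisk *disk;
	int ret;

	ret = blk_mq_alloc_sq_tag_set(&d->tag_set, &mymq_ops, 2,
				      BLK_MQ_F_SHOULD_MERGE);
	if (ret)
		return ret;

	disk = blk_mq_alloc_disk(&d->tag_set, d);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_free_tag_set;
	}
	d->gd = disk;
	set_capacity(disk, d->nr_sectors);

	ret = add_disk(disk);	/* can fail now and must not be ignored */
	if (ret)
		goto out_cleanup_disk;
	return 0;

out_cleanup_disk:
	blk_cleanup_disk(disk);
out_free_tag_set:
	blk_mq_free_tag_set(&d->tag_set);
	return ret;
}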
static int rsxx_blkdev_ioctl(struct block_device *bdev, @@ -120,7 +120,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, } } -static blk_qc_t rsxx_submit_bio(struct bio *bio) +static void rsxx_submit_bio(struct bio *bio) { struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data; struct rsxx_bio_meta *bio_meta; @@ -169,7 +169,7 @@ static blk_qc_t rsxx_submit_bio(struct bio *bio) if (st) goto queue_err; - return BLK_QC_T_NONE; + return; queue_err: kmem_cache_free(bio_meta_pool, bio_meta); @@ -177,7 +177,6 @@ req_err: if (st) bio->bi_status = st; bio_endio(bio); - return BLK_QC_T_NONE; } /*----------------- Device Setup -------------------*/ @@ -192,6 +191,8 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card) int rsxx_attach_dev(struct rsxx_cardinfo *card) { + int err = 0; + mutex_lock(&card->dev_lock); /* The block device requires the stripe size from the config. */ @@ -200,13 +201,17 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card) set_capacity(card->gendisk, card->size8 >> 9); else set_capacity(card->gendisk, 0); - device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL); - card->bdev_attached = 1; + err = device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL); + if (err == 0) + card->bdev_attached = 1; } mutex_unlock(&card->dev_lock); - return 0; + if (err) + blk_cleanup_disk(card->gendisk); + + return err; } void rsxx_detach_dev(struct rsxx_cardinfo *card) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 7ccc8d2a41bc..821594cd1315 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -16,6 +16,7 @@ #include <linux/fd.h> #include <linux/slab.h> #include <linux/blk-mq.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/hdreg.h> #include <linux/kernel.h> @@ -184,6 +185,7 @@ struct floppy_state { int track; int ref_count; + bool registered; struct gendisk *disk; struct blk_mq_tag_set tag_set; @@ -771,6 +773,20 @@ static const struct blk_mq_ops swim_mq_ops = { .queue_rq = swim_queue_rq, }; +static void swim_cleanup_floppy_disk(struct floppy_state *fs) +{ + struct gendisk *disk = fs->disk; + + if (!disk) + return; + + if (fs->registered) + del_gendisk(fs->disk); + + blk_cleanup_disk(disk); + blk_mq_free_tag_set(&fs->tag_set); +} + static int swim_floppy_init(struct swim_priv *swd) { int err; @@ -827,7 +843,10 @@ static int swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; swd->unit[drive].disk->private_data = &swd->unit[drive]; set_capacity(swd->unit[drive].disk, 2880); - add_disk(swd->unit[drive].disk); + err = add_disk(swd->unit[drive].disk); + if (err) + goto exit_put_disks; + swd->unit[drive].registered = true; } return 0; @@ -835,12 +854,7 @@ static int swim_floppy_init(struct swim_priv *swd) exit_put_disks: unregister_blkdev(FLOPPY_MAJOR, "fd"); do { - struct gendisk *disk = swd->unit[drive].disk; - - if (!disk) - continue; - blk_cleanup_disk(disk); - blk_mq_free_tag_set(&swd->unit[drive].tag_set); + swim_cleanup_floppy_disk(&swd->unit[drive]); } while (drive--); return err; } @@ -909,12 +923,8 @@ static int swim_remove(struct platform_device *dev) int drive; struct resource *res; - for (drive = 0; drive < swd->floppy_count; drive++) { - del_gendisk(swd->unit[drive].disk); - blk_cleanup_queue(swd->unit[drive].disk->queue); - blk_mq_free_tag_set(&swd->unit[drive].tag_set); - put_disk(swd->unit[drive].disk); - } + for (drive = 0; drive < swd->floppy_count; drive++) + swim_cleanup_floppy_disk(&swd->unit[drive]); unregister_blkdev(FLOPPY_MAJOR, "fd"); diff --git 
a/drivers/block/swim3.c b/drivers/block/swim3.c index 965af0a3e95b..4b91c9aa5892 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/wait.h> +#include <linux/major.h> #include <asm/io.h> #include <asm/dbdma.h> #include <asm/prom.h> @@ -1229,7 +1230,9 @@ static int swim3_attach(struct macio_dev *mdev, disk->flags |= GENHD_FL_REMOVABLE; sprintf(disk->disk_name, "fd%d", floppy_count); set_capacity(disk, 2880); - add_disk(disk); + rc = add_disk(disk); + if (rc) + goto out_cleanup_disk; disks[floppy_count++] = disk; return 0; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 420cd952ddc4..d1676fe0da1a 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -297,6 +297,7 @@ struct carm_host { struct work_struct fsm_task; + int probe_err; struct completion probe_comp; }; @@ -1181,8 +1182,11 @@ static void carm_fsm_task (struct work_struct *work) struct gendisk *disk = port->disk; set_capacity(disk, port->capacity); - add_disk(disk); - activated++; + host->probe_err = add_disk(disk); + if (!host->probe_err) + activated++; + else + break; } printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n", @@ -1192,11 +1196,9 @@ static void carm_fsm_task (struct work_struct *work) reschedule = 1; break; } - case HST_PROBE_FINISHED: complete(&host->probe_comp); break; - case HST_ERROR: /* FIXME: TODO */ break; @@ -1507,7 +1509,12 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_free_irq; DPRINTK("waiting for probe_comp\n"); + host->probe_err = -ENODEV; wait_for_completion(&host->probe_comp); + if (host->probe_err) { + rc = host->probe_err; + goto err_out_free_irq; + } printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n", host->name, pci_name(pdev), (int) CARM_MAX_PORTS, diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 9b3bd083b411..fc4fc951dba7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -312,7 +312,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) struct request *req; int err; - req = blk_get_request(q, REQ_OP_DRV_IN, 0); + req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -323,7 +323,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) blk_execute_rq(vblk->disk, req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: - blk_put_request(req); + blk_mq_free_request(req); return err; } @@ -689,28 +689,6 @@ static const struct blk_mq_ops virtio_mq_ops = { static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); -static int virtblk_validate(struct virtio_device *vdev) -{ - u32 blk_size; - - if (!vdev->config->get) { - dev_err(&vdev->dev, "%s failure: config access disabled\n", - __func__); - return -EINVAL; - } - - if (!virtio_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) - return 0; - - blk_size = virtio_cread32(vdev, - offsetof(struct virtio_blk_config, blk_size)); - - if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) - __virtio_clear_bit(vdev, VIRTIO_BLK_F_BLK_SIZE); - - return 0; -} - static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -722,6 +700,12 @@ static int virtblk_probe(struct virtio_device *vdev) u8 physical_block_exp, alignment_offset; unsigned int queue_depth; + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + err = 
ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), GFP_KERNEL); if (err < 0) @@ -831,19 +815,19 @@ static int virtblk_probe(struct virtio_device *vdev) err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, struct virtio_blk_config, blk_size, &blk_size); - if (!err) + if (!err) { + err = blk_validate_block_size(blk_size); + if (err) { + dev_err(&vdev->dev, + "virtio_blk: invalid block size: 0x%x\n", + blk_size); + goto out_cleanup_disk; + } + blk_queue_logical_block_size(q, blk_size); - else + } else blk_size = queue_logical_block_size(q); - if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) { - dev_err(&vdev->dev, - "block size is changed unexpectedly, now is %u\n", - blk_size); - err = -EINVAL; - goto out_cleanup_disk; - } - /* Use topology information if available */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, physical_block_exp, @@ -1009,7 +993,6 @@ static struct virtio_driver virtio_blk = { .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, - .validate = virtblk_validate, .probe = virtblk_probe, .remove = virtblk_remove, .config_changed = virtblk_config_changed, diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 33eba3df4dd9..914587aabca0 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -98,7 +98,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) return; } - err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping); + err = sync_blockdev(blkif->vbd.bdev); if (err) { xenbus_dev_error(blkif->be->dev, err, "block flush"); return; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 72902104f111..8e3983e456f3 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -42,6 +42,7 @@ #include <linux/cdrom.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/scatterlist.h> #include <linux/bitmap.h> @@ -2385,7 +2386,13 @@ static void blkfront_connect(struct blkfront_info *info) for_each_rinfo(info, rinfo, i) kick_pending_request_queues(rinfo); - device_add_disk(&info->xbdev->dev, info->gd, NULL); + err = device_add_disk(&info->xbdev->dev, info->gd, NULL); + if (err) { + blk_cleanup_disk(info->gd); + blk_mq_free_tag_set(&info->tag_set); + info->rq = NULL; + goto fail; + } info->is_ready = 1; return; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fcaf2750f68f..a68297fb51a2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1598,22 +1598,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) /* * Handler function for all zram I/O requests. 
*/ -static blk_qc_t zram_submit_bio(struct bio *bio) +static void zram_submit_bio(struct bio *bio) { struct zram *zram = bio->bi_bdev->bd_disk->private_data; if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); - goto error; + bio_io_error(bio); + return; } __zram_make_request(zram, bio); - return BLK_QC_T_NONE; - -error: - bio_io_error(bio); - return BLK_QC_T_NONE; } static void zram_slot_free_notify(struct block_device *bdev, diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig index a5b96f3aad67..a4cf3d692dc3 100644 --- a/drivers/bus/Kconfig +++ b/drivers/bus/Kconfig @@ -152,18 +152,6 @@ config QCOM_EBI2 Interface 2, which can be used to connect things like NAND Flash, SRAM, ethernet adapters, FPGAs and LCD displays. -config SIMPLE_PM_BUS - tristate "Simple Power-Managed Bus Driver" - depends on OF && PM - help - Driver for transparent busses that don't need a real driver, but - where the bus controller is part of a PM domain, or under the control - of a functional clock, and thus relies on runtime PM for managing - this PM domain and/or clock. - An example of such a bus controller is the Renesas Bus State - Controller (BSC, sometimes called "LBSC within Bus Bridge", or - "External Bus Interface") as found on several Renesas ARM SoCs. - config SUN50I_DE2_BUS bool "Allwinner A64 DE2 Bus Driver" default ARM64 diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile index 1c29c5e8ffb8..52c2f35a26a9 100644 --- a/drivers/bus/Makefile +++ b/drivers/bus/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_OMAP_OCP2SCP) += omap-ocp2scp.o obj-$(CONFIG_QCOM_EBI2) += qcom-ebi2.o obj-$(CONFIG_SUN50I_DE2_BUS) += sun50i-de2.o obj-$(CONFIG_SUNXI_RSB) += sunxi-rsb.o -obj-$(CONFIG_SIMPLE_PM_BUS) += simple-pm-bus.o +obj-$(CONFIG_OF) += simple-pm-bus.o obj-$(CONFIG_TEGRA_ACONNECT) += tegra-aconnect.o obj-$(CONFIG_TEGRA_GMI) += tegra-gmi.o obj-$(CONFIG_TI_PWMSS) += ti-pwmss.o diff --git a/drivers/bus/simple-pm-bus.c b/drivers/bus/simple-pm-bus.c index 01a3d0cd08ed..6b8d6257ed8a 100644 --- a/drivers/bus/simple-pm-bus.c +++ b/drivers/bus/simple-pm-bus.c @@ -13,11 +13,36 @@ #include <linux/platform_device.h> #include <linux/pm_runtime.h> - static int simple_pm_bus_probe(struct platform_device *pdev) { - const struct of_dev_auxdata *lookup = dev_get_platdata(&pdev->dev); - struct device_node *np = pdev->dev.of_node; + const struct device *dev = &pdev->dev; + const struct of_dev_auxdata *lookup = dev_get_platdata(dev); + struct device_node *np = dev->of_node; + const struct of_device_id *match; + + /* + * Allow user to use driver_override to bind this driver to a + * transparent bus device which has a different compatible string + * that's not listed in simple_pm_bus_of_match. We don't want to do any + * of the simple-pm-bus tasks for these devices, so return early. + */ + if (pdev->driver_override) + return 0; + + match = of_match_device(dev->driver->of_match_table, dev); + /* + * These are transparent bus devices (not simple-pm-bus matches) that + * have their child nodes populated automatically. So, don't need to + * do anything more. We only match with the device if this driver is + * the most specific match because we don't want to incorrectly bind to + * a device that has a more specific driver. 
+ */ + if (match && match->data) { + if (of_property_match_string(np, "compatible", match->compatible) == 0) + return 0; + else + return -ENODEV; + } dev_dbg(&pdev->dev, "%s\n", __func__); @@ -31,14 +56,25 @@ static int simple_pm_bus_probe(struct platform_device *pdev) static int simple_pm_bus_remove(struct platform_device *pdev) { + const void *data = of_device_get_match_data(&pdev->dev); + + if (pdev->driver_override || data) + return 0; + dev_dbg(&pdev->dev, "%s\n", __func__); pm_runtime_disable(&pdev->dev); return 0; } +#define ONLY_BUS ((void *) 1) /* Match if the device is only a bus. */ + static const struct of_device_id simple_pm_bus_of_match[] = { { .compatible = "simple-pm-bus", }, + { .compatible = "simple-bus", .data = ONLY_BUS }, + { .compatible = "simple-mfd", .data = ONLY_BUS }, + { .compatible = "isa", .data = ONLY_BUS }, + { .compatible = "arm,amba-bus", .data = ONLY_BUS }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, simple_pm_bus_of_match); diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index a51c2a8feed9..6a8b7fb5be58 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1464,6 +1464,9 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { /* Quirks that need to be set based on detected module */ SYSC_QUIRK("aess", 0, 0, 0x10, -ENODEV, 0x40000000, 0xffffffff, SYSC_MODULE_QUIRK_AESS), + /* Errata i893 handling for dra7 dcan1 and 2 */ + SYSC_QUIRK("dcan", 0x4ae3c000, 0x20, -ENODEV, -ENODEV, 0xa3170504, 0xffffffff, + SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dcan", 0x48480000, 0x20, -ENODEV, -ENODEV, 0xa3170504, 0xffffffff, SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dss", 0x4832a000, 0, 0x10, 0x14, 0x00000020, 0xffffffff, @@ -2954,6 +2957,7 @@ static int sysc_init_soc(struct sysc *ddata) break; case SOC_AM3: sysc_add_disabled(0x48310000); /* rng */ + break; default: break; } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index bd2e5b1560f5..9877e413fce3 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -344,6 +344,12 @@ static void cdrom_sysctl_register(void); static LIST_HEAD(cdrom_list); +static void signal_media_change(struct cdrom_device_info *cdi) +{ + cdi->mc_flags = 0x3; /* set media changed bits, on both queues */ + cdi->last_media_change_ms = ktime_to_ms(ktime_get()); +} + int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, struct packet_command *cgc) { @@ -616,6 +622,7 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) ENSURE(cdo, generic_packet, CDC_GENERIC_PACKET); cdi->mc_flags = 0; cdi->options = CDO_USE_FFLAGS; + cdi->last_media_change_ms = ktime_to_ms(ktime_get()); if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY)) cdi->options |= (int) CDO_AUTO_CLOSE; @@ -864,7 +871,7 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi) { struct packet_command cgc; char buffer[32]; - int ret, mmc3_profile; + int mmc3_profile; init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); @@ -874,7 +881,7 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi) cgc.cmd[8] = sizeof(buffer); /* Allocation Length */ cgc.quiet = 1; - if ((ret = cdi->ops->generic_packet(cdi, &cgc))) + if (cdi->ops->generic_packet(cdi, &cgc)) mmc3_profile = 0xffff; else mmc3_profile = (buffer[6] << 8) | buffer[7]; @@ -1421,8 +1428,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) cdi->ops->check_events(cdi, 0, slot); if (slot == CDSL_NONE) { - /* set media changed bits, on both queues */ - cdi->mc_flags = 0x3; + signal_media_change(cdi); return 
cdrom_load_unload(cdi, -1); } @@ -1455,7 +1461,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) slot = curslot; /* set media changed bits on both queues */ - cdi->mc_flags = 0x3; + signal_media_change(cdi); if ((ret = cdrom_load_unload(cdi, slot))) return ret; @@ -1521,7 +1527,7 @@ int media_changed(struct cdrom_device_info *cdi, int queue) cdi->ioctl_events = 0; if (changed) { - cdi->mc_flags = 0x3; /* set bit on both queues */ + signal_media_change(cdi); ret |= 1; cdi->media_written = 0; } @@ -2336,6 +2342,49 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi, return ret; } +/* + * Media change detection with timing information. + * + * arg is a pointer to a cdrom_timed_media_change_info struct. + * arg->last_media_change may be set by calling code to signal + * the timestamp (in ms) of the last known media change (by the caller). + * Upon successful return, ioctl call will set arg->last_media_change + * to the latest media change timestamp known by the kernel/driver + * and set arg->has_changed to 1 if that timestamp is more recent + * than the timestamp set by the caller. + */ +static int cdrom_ioctl_timed_media_change(struct cdrom_device_info *cdi, + unsigned long arg) +{ + int ret; + struct cdrom_timed_media_change_info __user *info; + struct cdrom_timed_media_change_info tmp_info; + + if (!CDROM_CAN(CDC_MEDIA_CHANGED)) + return -ENOSYS; + + info = (struct cdrom_timed_media_change_info __user *)arg; + cd_dbg(CD_DO_IOCTL, "entering CDROM_TIMED_MEDIA_CHANGE\n"); + + ret = cdrom_ioctl_media_changed(cdi, CDSL_CURRENT); + if (ret < 0) + return ret; + + if (copy_from_user(&tmp_info, info, sizeof(tmp_info)) != 0) + return -EFAULT; + + tmp_info.media_flags = 0; + if (tmp_info.last_media_change - cdi->last_media_change_ms < 0) + tmp_info.media_flags |= MEDIA_CHANGED_FLAG; + + tmp_info.last_media_change = cdi->last_media_change_ms; + + if (copy_to_user(info, &tmp_info, sizeof(*info)) != 0) + return -EFAULT; + + return 0; +} + static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi, unsigned long arg) { @@ -3313,6 +3362,8 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, return cdrom_ioctl_eject_sw(cdi, arg); case CDROM_MEDIA_CHANGED: return cdrom_ioctl_media_changed(cdi, arg); + case CDROM_TIMED_MEDIA_CHANGE: + return cdrom_ioctl_timed_media_change(cdi, arg); case CDROM_SET_OPTIONS: return cdrom_ioctl_set_options(cdi, arg); case CDROM_CLEAR_OPTIONS: diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 8e1fe75af93f..d50cc1fd34d5 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -805,9 +805,14 @@ static int probe_gdrom(struct platform_device *devptr) err = -ENOMEM; goto probe_fail_free_irqs; } - add_disk(gd.disk); + err = add_disk(gd.disk); + if (err) + goto probe_fail_add_disk; + return 0; +probe_fail_add_disk: + kfree(gd.toc); probe_fail_free_irqs: free_irq(HW_EVENT_GDROM_DMA, &gd); free_irq(HW_EVENT_GDROM_CMD, &gd); diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index d6ba644f6b00..4a5516406c22 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -76,7 +76,7 @@ config TCG_TIS_SPI_CR50 config TCG_TIS_SYNQUACER tristate "TPM Interface Specification 1.2 Interface / TPM 2.0 FIFO Interface (MMIO - SynQuacer)" - depends on ARCH_SYNQUACER + depends on ARCH_SYNQUACER || COMPILE_TEST select TCG_TIS_CORE help If you have a TPM security chip that is compliant with the diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c index 
784b8b3cb903..97e916856cf3 100644 --- a/drivers/char/tpm/tpm2-space.c +++ b/drivers/char/tpm/tpm2-space.c @@ -455,6 +455,9 @@ static int tpm2_map_response_body(struct tpm_chip *chip, u32 cc, u8 *rsp, if (be32_to_cpu(data->capability) != TPM2_CAP_HANDLES) return 0; + if (be32_to_cpu(data->count) > (UINT_MAX - TPM_HEADER_SIZE - 9) / 4) + return -EFAULT; + if (len != TPM_HEADER_SIZE + 9 + 4 * be32_to_cpu(data->count)) return -EFAULT; diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 69579efb247b..b2659a4c4016 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -48,6 +48,7 @@ static int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout, wait_queue_head_t *queue, bool check_cancel) { + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); unsigned long stop; long rc; u8 status; @@ -80,8 +81,8 @@ again: } } else { do { - usleep_range(TPM_TIMEOUT_USECS_MIN, - TPM_TIMEOUT_USECS_MAX); + usleep_range(priv->timeout_min, + priv->timeout_max); status = chip->ops->status(chip); if ((status & mask) == mask) return 0; @@ -945,7 +946,22 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, chip->timeout_b = msecs_to_jiffies(TIS_TIMEOUT_B_MAX); chip->timeout_c = msecs_to_jiffies(TIS_TIMEOUT_C_MAX); chip->timeout_d = msecs_to_jiffies(TIS_TIMEOUT_D_MAX); + priv->timeout_min = TPM_TIMEOUT_USECS_MIN; + priv->timeout_max = TPM_TIMEOUT_USECS_MAX; priv->phy_ops = phy_ops; + + rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor); + if (rc < 0) + goto out_err; + + priv->manufacturer_id = vendor; + + if (priv->manufacturer_id == TPM_VID_ATML && + !(chip->flags & TPM_CHIP_FLAG_TPM2)) { + priv->timeout_min = TIS_TIMEOUT_MIN_ATML; + priv->timeout_max = TIS_TIMEOUT_MAX_ATML; + } + dev_set_drvdata(&chip->dev, priv); if (is_bsw()) { @@ -988,12 +1004,6 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, if (rc) goto out_err; - rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor); - if (rc < 0) - goto out_err; - - priv->manufacturer_id = vendor; - rc = tpm_tis_read8(priv, TPM_RID(0), &rid); if (rc < 0) goto out_err; diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index b2a3c6c72882..3be24f221e32 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -54,6 +54,8 @@ enum tis_defaults { TIS_MEM_LEN = 0x5000, TIS_SHORT_TIMEOUT = 750, /* ms */ TIS_LONG_TIMEOUT = 2000, /* 2 sec */ + TIS_TIMEOUT_MIN_ATML = 14700, /* usecs */ + TIS_TIMEOUT_MAX_ATML = 15000, /* usecs */ }; /* Some timeout values are needed before it is known whether the chip is @@ -98,6 +100,8 @@ struct tpm_tis_data { wait_queue_head_t read_queue; const struct tpm_tis_phy_ops *phy_ops; unsigned short rng_quality; + unsigned int timeout_min; /* usecs */ + unsigned int timeout_max; /* usecs */ }; struct tpm_tis_phy_ops { diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index 54584b4b00d1..aaa59a00eeae 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -267,6 +267,7 @@ static const struct spi_device_id tpm_tis_spi_id[] = { { "st33htpm-spi", (unsigned long)tpm_tis_spi_probe }, { "slb9670", (unsigned long)tpm_tis_spi_probe }, { "tpm_tis_spi", (unsigned long)tpm_tis_spi_probe }, + { "tpm_tis-spi", (unsigned long)tpm_tis_spi_probe }, { "cr50", (unsigned long)cr50_spi_probe }, {} }; diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 0506046a5f4b..510a9965633b 100644 
--- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -58,11 +58,8 @@ static int clk_composite_determine_rate(struct clk_hw *hw, long rate; int i; - if (rate_hw && rate_ops && rate_ops->determine_rate) { - __clk_hw_set_clk(rate_hw, hw); - return rate_ops->determine_rate(rate_hw, req); - } else if (rate_hw && rate_ops && rate_ops->round_rate && - mux_hw && mux_ops && mux_ops->set_parent) { + if (rate_hw && rate_ops && rate_ops->round_rate && + mux_hw && mux_ops && mux_ops->set_parent) { req->best_parent_hw = NULL; if (clk_hw_get_flags(hw) & CLK_SET_RATE_NO_REPARENT) { @@ -107,6 +104,9 @@ static int clk_composite_determine_rate(struct clk_hw *hw, req->rate = best_rate; return 0; + } else if (rate_hw && rate_ops && rate_ops->determine_rate) { + __clk_hw_set_clk(rate_hw, hw); + return rate_ops->determine_rate(rate_hw, req); } else if (mux_hw && mux_ops && mux_ops->determine_rate) { __clk_hw_set_clk(mux_hw, hw); return mux_ops->determine_rate(mux_hw, req); diff --git a/drivers/clk/qcom/Kconfig b/drivers/clk/qcom/Kconfig index 0a5596797b93..9ef007b3cf9b 100644 --- a/drivers/clk/qcom/Kconfig +++ b/drivers/clk/qcom/Kconfig @@ -564,6 +564,7 @@ config SM_GCC_6125 config SM_GCC_6350 tristate "SM6350 Global Clock Controller" + select QCOM_GDSC help Support for the global clock controller on SM6350 devices. Say Y if you want to use peripheral devices such as UART, diff --git a/drivers/clk/qcom/gcc-sm6115.c b/drivers/clk/qcom/gcc-sm6115.c index bc09736ece76..68fe9f6f0d2f 100644 --- a/drivers/clk/qcom/gcc-sm6115.c +++ b/drivers/clk/qcom/gcc-sm6115.c @@ -3242,7 +3242,7 @@ static struct gdsc hlos1_vote_turing_mmu_tbu1_gdsc = { }; static struct gdsc hlos1_vote_turing_mmu_tbu0_gdsc = { - .gdscr = 0x7d060, + .gdscr = 0x7d07c, .pd = { .name = "hlos1_vote_turing_mmu_tbu0", }, diff --git a/drivers/clk/renesas/r9a07g044-cpg.c b/drivers/clk/renesas/r9a07g044-cpg.c index 4c94b94c4125..1490446985e2 100644 --- a/drivers/clk/renesas/r9a07g044-cpg.c +++ b/drivers/clk/renesas/r9a07g044-cpg.c @@ -186,6 +186,8 @@ static struct rzg2l_reset r9a07g044_resets[] = { static const unsigned int r9a07g044_crit_mod_clks[] __initconst = { MOD_CLK_BASE + R9A07G044_GIC600_GICCLK, + MOD_CLK_BASE + R9A07G044_IA55_CLK, + MOD_CLK_BASE + R9A07G044_DMAC_ACLK, }; const struct rzg2l_cpg_info r9a07g044_cpg_info = { diff --git a/drivers/clk/renesas/rzg2l-cpg.c b/drivers/clk/renesas/rzg2l-cpg.c index 3b3b2c3347f3..761922ea5db7 100644 --- a/drivers/clk/renesas/rzg2l-cpg.c +++ b/drivers/clk/renesas/rzg2l-cpg.c @@ -391,7 +391,7 @@ static int rzg2l_mod_clock_is_enabled(struct clk_hw *hw) value = readl(priv->base + CLK_MON_R(clock->off)); - return !(value & bitmask); + return value & bitmask; } static const struct clk_ops rzg2l_mod_clock_ops = { diff --git a/drivers/clk/socfpga/clk-agilex.c b/drivers/clk/socfpga/clk-agilex.c index 242e94c0cf8a..bf8cd928c228 100644 --- a/drivers/clk/socfpga/clk-agilex.c +++ b/drivers/clk/socfpga/clk-agilex.c @@ -165,13 +165,6 @@ static const struct clk_parent_data mpu_mux[] = { .name = "boot_clk", }, }; -static const struct clk_parent_data s2f_usr0_mux[] = { - { .fw_name = "f2s-free-clk", - .name = "f2s-free-clk", }, - { .fw_name = "boot_clk", - .name = "boot_clk", }, -}; - static const struct clk_parent_data emac_mux[] = { { .fw_name = "emaca_free_clk", .name = "emaca_free_clk", }, @@ -312,8 +305,6 @@ static const struct stratix10_gate_clock agilex_gate_clks[] = { 4, 0x44, 28, 1, 0, 0, 0}, { AGILEX_CS_TIMER_CLK, "cs_timer_clk", NULL, noc_mux, ARRAY_SIZE(noc_mux), 0, 0x24, 5, 0, 0, 0, 0x30, 1, 0}, - { 
AGILEX_S2F_USER0_CLK, "s2f_user0_clk", NULL, s2f_usr0_mux, ARRAY_SIZE(s2f_usr0_mux), 0, 0x24, - 6, 0, 0, 0, 0, 0, 0}, { AGILEX_EMAC0_CLK, "emac0_clk", NULL, emac_mux, ARRAY_SIZE(emac_mux), 0, 0x7C, 0, 0, 0, 0, 0x94, 26, 0}, { AGILEX_EMAC1_CLK, "emac1_clk", NULL, emac_mux, ARRAY_SIZE(emac_mux), 0, 0x7C, diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 0f5e3983951a..f65e31bab9ae 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -24,6 +24,7 @@ config I8253_LOCK config OMAP_DM_TIMER bool + select TIMER_OF config CLKBLD_I8253 def_bool y if CLKSRC_I8253 || CLKEVT_I8253 || I8253_LOCK @@ -418,12 +419,14 @@ config ATMEL_TCB_CLKSRC config CLKSRC_EXYNOS_MCT bool "Exynos multi core timer driver" if COMPILE_TEST depends on ARM || ARM64 + depends on ARCH_EXYNOS || COMPILE_TEST help Support for Multi Core Timer controller on Exynos SoCs. config CLKSRC_SAMSUNG_PWM bool "PWM timer driver for Samsung S3C, S5P" if COMPILE_TEST depends on HAS_IOMEM + depends on ARCH_EXYNOS || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210 || COMPILE_TEST help This is a new clocksource driver for the PWM timer found in Samsung S3C, S5P and Exynos SoCs, replacing an earlier driver diff --git a/drivers/clocksource/arc_timer.c b/drivers/clocksource/arc_timer.c index de93dd1a8c7b..cb18524cc13d 100644 --- a/drivers/clocksource/arc_timer.c +++ b/drivers/clocksource/arc_timer.c @@ -225,7 +225,7 @@ static int __init arc_cs_setup_timer1(struct device_node *node) write_aux_reg(ARC_REG_TIMER1_LIMIT, ARC_TIMERN_MAX); write_aux_reg(ARC_REG_TIMER1_CNT, 0); - write_aux_reg(ARC_REG_TIMER1_CTRL, TIMER_CTRL_NH); + write_aux_reg(ARC_REG_TIMER1_CTRL, ARC_TIMER_CTRL_NH); sched_clock_register(arc_timer1_clock_read, 32, arc_timer_freq); @@ -245,7 +245,7 @@ static void arc_timer_event_setup(unsigned int cycles) write_aux_reg(ARC_REG_TIMER0_LIMIT, cycles); write_aux_reg(ARC_REG_TIMER0_CNT, 0); /* start from 0 */ - write_aux_reg(ARC_REG_TIMER0_CTRL, TIMER_CTRL_IE | TIMER_CTRL_NH); + write_aux_reg(ARC_REG_TIMER0_CTRL, ARC_TIMER_CTRL_IE | ARC_TIMER_CTRL_NH); } @@ -294,7 +294,7 @@ static irqreturn_t timer_irq_handler(int irq, void *dev_id) * explicitly clears IP bit * 2. 
Re-arm interrupt if periodic by writing to IE bit [0] */ - write_aux_reg(ARC_REG_TIMER0_CTRL, irq_reenable | TIMER_CTRL_NH); + write_aux_reg(ARC_REG_TIMER0_CTRL, irq_reenable | ARC_TIMER_CTRL_NH); evt->event_handler(evt); diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index be6d741d404c..9a04eacc4412 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -44,23 +44,29 @@ #define CNTACR_RWVT BIT(4) #define CNTACR_RWPT BIT(5) -#define CNTVCT_LO 0x08 -#define CNTVCT_HI 0x0c +#define CNTVCT_LO 0x00 +#define CNTPCT_LO 0x08 #define CNTFRQ 0x10 -#define CNTP_TVAL 0x28 +#define CNTP_CVAL_LO 0x20 #define CNTP_CTL 0x2c -#define CNTV_TVAL 0x38 +#define CNTV_CVAL_LO 0x30 #define CNTV_CTL 0x3c -static unsigned arch_timers_present __initdata; +/* + * The minimum amount of time a generic counter is guaranteed to not roll over + * (40 years) + */ +#define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) -static void __iomem *arch_counter_base __ro_after_init; +static unsigned arch_timers_present __initdata; struct arch_timer { void __iomem *base; struct clock_event_device evt; }; +static struct arch_timer *arch_timer_mem __ro_after_init; + #define to_arch_timer(e) container_of(e, struct arch_timer, evt) static u32 arch_timer_rate __ro_after_init; @@ -96,32 +102,57 @@ static int __init early_evtstrm_cfg(char *buf) early_param("clocksource.arm_arch_timer.evtstrm", early_evtstrm_cfg); /* + * Makes an educated guess at a valid counter width based on the Generic Timer + * specification. Of note: + * 1) the system counter is at least 56 bits wide + * 2) a roll-over time of not less than 40 years + * + * See 'ARM DDI 0487G.a D11.1.2 ("The system counter")' for more details. + */ +static int arch_counter_get_width(void) +{ + u64 min_cycles = MIN_ROLLOVER_SECS * arch_timer_rate; + + /* guarantee the returned width is within the valid range */ + return clamp_val(ilog2(min_cycles - 1) + 1, 56, 64); +} + +/* * Architected system timer support. */ static __always_inline -void arch_timer_reg_write(int access, enum arch_timer_reg reg, u32 val, +void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, struct clock_event_device *clk) { if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: - writel_relaxed(val, timer->base + CNTP_CTL); + writel_relaxed((u32)val, timer->base + CNTP_CTL); break; - case ARCH_TIMER_REG_TVAL: - writel_relaxed(val, timer->base + CNTP_TVAL); + case ARCH_TIMER_REG_CVAL: + /* + * Not guaranteed to be atomic, so the timer + * must be disabled at this point. 
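/*
 * Worked example for arch_counter_get_width() above, assuming a
 * hypothetical 50 MHz system counter:
 *
 *   min_cycles = MIN_ROLLOVER_SECS * rate
 *              = 40 * 365 * 24 * 3600 s * 50,000,000 Hz ~= 6.31 * 10^16
 *   ilog2(min_cycles - 1) + 1 = 56, and clamp_val(56, 56, 64) = 56 bits
 *
 * A 1 GHz counter gives ~1.26 * 10^18 cycles and therefore 61 bits, so the
 * clocksource mask and sched_clock width now follow the counter rate
 * instead of the old hard-coded 56-bit assumption.
 */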
+ */ + writeq_relaxed(val, timer->base + CNTP_CVAL_LO); break; + default: + BUILD_BUG(); } } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: - writel_relaxed(val, timer->base + CNTV_CTL); + writel_relaxed((u32)val, timer->base + CNTV_CTL); break; - case ARCH_TIMER_REG_TVAL: - writel_relaxed(val, timer->base + CNTV_TVAL); + case ARCH_TIMER_REG_CVAL: + /* Same restriction as above */ + writeq_relaxed(val, timer->base + CNTV_CVAL_LO); break; + default: + BUILD_BUG(); } } else { arch_timer_reg_write_cp15(access, reg, val); @@ -140,9 +171,8 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, case ARCH_TIMER_REG_CTRL: val = readl_relaxed(timer->base + CNTP_CTL); break; - case ARCH_TIMER_REG_TVAL: - val = readl_relaxed(timer->base + CNTP_TVAL); - break; + default: + BUILD_BUG(); } } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); @@ -150,9 +180,8 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, case ARCH_TIMER_REG_CTRL: val = readl_relaxed(timer->base + CNTV_CTL); break; - case ARCH_TIMER_REG_TVAL: - val = readl_relaxed(timer->base + CNTV_TVAL); - break; + default: + BUILD_BUG(); } } else { val = arch_timer_reg_read_cp15(access, reg); @@ -205,13 +234,11 @@ static struct clocksource clocksource_counter = { .id = CSID_ARM_ARCH_COUNTER, .rating = 400, .read = arch_counter_read, - .mask = CLOCKSOURCE_MASK(56), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct cyclecounter cyclecounter __ro_after_init = { .read = arch_counter_read_cc, - .mask = CLOCKSOURCE_MASK(56), }; struct ate_acpi_oem_info { @@ -239,16 +266,6 @@ struct ate_acpi_oem_info { _new; \ }) -static u32 notrace fsl_a008585_read_cntp_tval_el0(void) -{ - return __fsl_a008585_read_reg(cntp_tval_el0); -} - -static u32 notrace fsl_a008585_read_cntv_tval_el0(void) -{ - return __fsl_a008585_read_reg(cntv_tval_el0); -} - static u64 notrace fsl_a008585_read_cntpct_el0(void) { return __fsl_a008585_read_reg(cntpct_el0); @@ -285,16 +302,6 @@ static u64 notrace fsl_a008585_read_cntvct_el0(void) _new; \ }) -static u32 notrace hisi_161010101_read_cntp_tval_el0(void) -{ - return __hisi_161010101_read_reg(cntp_tval_el0); -} - -static u32 notrace hisi_161010101_read_cntv_tval_el0(void) -{ - return __hisi_161010101_read_reg(cntv_tval_el0); -} - static u64 notrace hisi_161010101_read_cntpct_el0(void) { return __hisi_161010101_read_reg(cntpct_el0); @@ -379,16 +386,6 @@ static u64 notrace sun50i_a64_read_cntvct_el0(void) { return __sun50i_a64_read_reg(cntvct_el0); } - -static u32 notrace sun50i_a64_read_cntp_tval_el0(void) -{ - return read_sysreg(cntp_cval_el0) - sun50i_a64_read_cntpct_el0(); -} - -static u32 notrace sun50i_a64_read_cntv_tval_el0(void) -{ - return read_sysreg(cntv_cval_el0) - sun50i_a64_read_cntvct_el0(); -} #endif #ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND @@ -397,7 +394,7 @@ EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround); static atomic_t timer_unstable_counter_workaround_in_use = ATOMIC_INIT(0); -static void erratum_set_next_event_tval_generic(const int access, unsigned long evt, +static void erratum_set_next_event_generic(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; @@ -418,17 +415,17 @@ static void erratum_set_next_event_tval_generic(const int access, unsigned long arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } -static __maybe_unused int erratum_set_next_event_tval_virt(unsigned long evt, +static 
__maybe_unused int erratum_set_next_event_virt(unsigned long evt, struct clock_event_device *clk) { - erratum_set_next_event_tval_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk); + erratum_set_next_event_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk); return 0; } -static __maybe_unused int erratum_set_next_event_tval_phys(unsigned long evt, +static __maybe_unused int erratum_set_next_event_phys(unsigned long evt, struct clock_event_device *clk) { - erratum_set_next_event_tval_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk); + erratum_set_next_event_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk); return 0; } @@ -438,12 +435,10 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = { .match_type = ate_match_dt, .id = "fsl,erratum-a008585", .desc = "Freescale erratum a005858", - .read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0, - .read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0, .read_cntpct_el0 = fsl_a008585_read_cntpct_el0, .read_cntvct_el0 = fsl_a008585_read_cntvct_el0, - .set_next_event_phys = erratum_set_next_event_tval_phys, - .set_next_event_virt = erratum_set_next_event_tval_virt, + .set_next_event_phys = erratum_set_next_event_phys, + .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_HISILICON_ERRATUM_161010101 @@ -451,23 +446,19 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = { .match_type = ate_match_dt, .id = "hisilicon,erratum-161010101", .desc = "HiSilicon erratum 161010101", - .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0, - .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0, .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, - .set_next_event_phys = erratum_set_next_event_tval_phys, - .set_next_event_virt = erratum_set_next_event_tval_virt, + .set_next_event_phys = erratum_set_next_event_phys, + .set_next_event_virt = erratum_set_next_event_virt, }, { .match_type = ate_match_acpi_oem_info, .id = hisi_161010101_oem_info, .desc = "HiSilicon erratum 161010101", - .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0, - .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0, .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, - .set_next_event_phys = erratum_set_next_event_tval_phys, - .set_next_event_virt = erratum_set_next_event_tval_virt, + .set_next_event_phys = erratum_set_next_event_phys, + .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_ARM64_ERRATUM_858921 @@ -484,12 +475,10 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = { .match_type = ate_match_dt, .id = "allwinner,erratum-unknown1", .desc = "Allwinner erratum UNKNOWN1", - .read_cntp_tval_el0 = sun50i_a64_read_cntp_tval_el0, - .read_cntv_tval_el0 = sun50i_a64_read_cntv_tval_el0, .read_cntpct_el0 = sun50i_a64_read_cntpct_el0, .read_cntvct_el0 = sun50i_a64_read_cntvct_el0, - .set_next_event_phys = erratum_set_next_event_tval_phys, - .set_next_event_virt = erratum_set_next_event_tval_virt, + .set_next_event_phys = erratum_set_next_event_phys, + .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_ARM64_ERRATUM_1418040 @@ -727,10 +716,18 @@ static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; + u64 cnt; + ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; - arch_timer_reg_write(access, ARCH_TIMER_REG_TVAL, evt, clk); + + if 
(access == ARCH_TIMER_PHYS_ACCESS) + cnt = __arch_counter_get_cntpct(); + else + cnt = __arch_counter_get_cntvct(); + + arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } @@ -748,23 +745,79 @@ static int arch_timer_set_next_event_phys(unsigned long evt, return 0; } +static u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) +{ + u32 cnt_lo, cnt_hi, tmp_hi; + + do { + cnt_hi = readl_relaxed(t->base + offset_lo + 4); + cnt_lo = readl_relaxed(t->base + offset_lo); + tmp_hi = readl_relaxed(t->base + offset_lo + 4); + } while (cnt_hi != tmp_hi); + + return ((u64) cnt_hi << 32) | cnt_lo; +} + +static __always_inline void set_next_event_mem(const int access, unsigned long evt, + struct clock_event_device *clk) +{ + struct arch_timer *timer = to_arch_timer(clk); + unsigned long ctrl; + u64 cnt; + + ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl |= ARCH_TIMER_CTRL_ENABLE; + ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; + + if (access == ARCH_TIMER_MEM_VIRT_ACCESS) + cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); + else + cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); + + arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); + arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); +} + static int arch_timer_set_next_event_virt_mem(unsigned long evt, struct clock_event_device *clk) { - set_next_event(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); + set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); return 0; } static int arch_timer_set_next_event_phys_mem(unsigned long evt, struct clock_event_device *clk) { - set_next_event(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); + set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); return 0; } +static u64 __arch_timer_check_delta(void) +{ +#ifdef CONFIG_ARM64 + const struct midr_range broken_cval_midrs[] = { + /* + * XGene-1 implements CVAL in terms of TVAL, meaning + * that the maximum timer range is 32bit. Shame on them. 
+ */ + MIDR_ALL_VERSIONS(MIDR_CPU_MODEL(ARM_CPU_IMP_APM, + APM_CPU_PART_POTENZA)), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), broken_cval_midrs)) { + pr_warn_once("Broken CNTx_CVAL_EL1, limiting width to 32bits"); + return CLOCKSOURCE_MASK(32); + } +#endif + return CLOCKSOURCE_MASK(arch_counter_get_width()); +} + static void __arch_timer_setup(unsigned type, struct clock_event_device *clk) { + u64 max_delta; + clk->features = CLOCK_EVT_FEAT_ONESHOT; if (type == ARCH_TIMER_TYPE_CP15) { @@ -796,6 +849,7 @@ static void __arch_timer_setup(unsigned type, } clk->set_next_event = sne; + max_delta = __arch_timer_check_delta(); } else { clk->features |= CLOCK_EVT_FEAT_DYNIRQ; clk->name = "arch_mem_timer"; @@ -812,11 +866,13 @@ static void __arch_timer_setup(unsigned type, clk->set_next_event = arch_timer_set_next_event_phys_mem; } + + max_delta = CLOCKSOURCE_MASK(56); } clk->set_state_shutdown(clk); - clockevents_config_and_register(clk, arch_timer_rate, 0xf, 0x7fffffff); + clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); } static void arch_timer_evtstrm_enable(int divider) @@ -986,15 +1042,7 @@ bool arch_timer_evtstrm_available(void) static u64 arch_counter_get_cntvct_mem(void) { - u32 vct_lo, vct_hi, tmp_hi; - - do { - vct_hi = readl_relaxed(arch_counter_base + CNTVCT_HI); - vct_lo = readl_relaxed(arch_counter_base + CNTVCT_LO); - tmp_hi = readl_relaxed(arch_counter_base + CNTVCT_HI); - } while (vct_hi != tmp_hi); - - return ((u64) vct_hi << 32) | vct_lo; + return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); } static struct arch_timer_kvm_info arch_timer_kvm_info; @@ -1007,6 +1055,7 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) static void __init arch_counter_register(unsigned type) { u64 start_count; + int width; /* Register the CP15 based counter if we have one */ if (type & ARCH_TIMER_TYPE_CP15) { @@ -1031,6 +1080,10 @@ static void __init arch_counter_register(unsigned type) arch_timer_read_counter = arch_counter_get_cntvct_mem; } + width = arch_counter_get_width(); + clocksource_counter.mask = CLOCKSOURCE_MASK(width); + cyclecounter.mask = CLOCKSOURCE_MASK(width); + if (!arch_counter_suspend_stop) clocksource_counter.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; start_count = arch_timer_read_counter(); @@ -1040,8 +1093,7 @@ static void __init arch_counter_register(unsigned type) timecounter_init(&arch_timer_kvm_info.timecounter, &cyclecounter, start_count); - /* 56 bits minimum, so we assume worst case rollover */ - sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate); + sched_clock_register(arch_timer_read_counter, width, arch_timer_rate); } static void arch_timer_stop(struct clock_event_device *clk) @@ -1182,25 +1234,25 @@ static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) { int ret; irq_handler_t func; - struct arch_timer *t; - t = kzalloc(sizeof(*t), GFP_KERNEL); - if (!t) + arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); + if (!arch_timer_mem) return -ENOMEM; - t->base = base; - t->evt.irq = irq; - __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &t->evt); + arch_timer_mem->base = base; + arch_timer_mem->evt.irq = irq; + __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); if (arch_timer_mem_use_virtual) func = arch_timer_handler_virt_mem; else func = arch_timer_handler_phys_mem; - ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &t->evt); + ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); if (ret) { pr_err("Failed to request mem timer 
irq\n"); - kfree(t); + kfree(arch_timer_mem); + arch_timer_mem = NULL; } return ret; @@ -1458,7 +1510,6 @@ arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) return ret; } - arch_counter_base = base; arch_timers_present |= ARCH_TIMER_TYPE_MEM; return 0; diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index e3e757513d1b..b1f46a974b9e 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -178,7 +178,7 @@ static void axp_mc_check(struct mem_ctl_info *mci) "details unavailable (multiple errors)"); if (cnt_dbe) edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, - cnt_sbe, /* error count */ + cnt_dbe, /* error count */ 0, 0, 0, /* pfn, offset, syndrome */ -1, -1, -1, /* top, mid, low layer */ mci->ctl_name, diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 220a58cf0a44..cda7d7162cbb 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -203,10 +203,7 @@ config INTEL_STRATIX10_RSU Say Y here if you want Intel RSU support. config QCOM_SCM - tristate "Qcom SCM driver" - depends on ARM || ARM64 - depends on HAVE_ARM_SMCCC - select RESET_CONTROLLER + tristate config QCOM_SCM_DOWNLOAD_MODE_DEFAULT bool "Qualcomm download mode enabled by default" diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index 00fe595a5bc8..641a91819088 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -49,6 +49,13 @@ static int ffa_device_probe(struct device *dev) return ffa_drv->probe(ffa_dev); } +static void ffa_device_remove(struct device *dev) +{ + struct ffa_driver *ffa_drv = to_ffa_driver(dev->driver); + + ffa_drv->remove(to_ffa_dev(dev)); +} + static int ffa_device_uevent(struct device *dev, struct kobj_uevent_env *env) { struct ffa_device *ffa_dev = to_ffa_dev(dev); @@ -86,6 +93,7 @@ struct bus_type ffa_bus_type = { .name = "arm_ffa", .match = ffa_device_match, .probe = ffa_device_probe, + .remove = ffa_device_remove, .uevent = ffa_device_uevent, .dev_groups = ffa_device_attributes_groups, }; @@ -127,7 +135,7 @@ static void ffa_release_device(struct device *dev) static int __ffa_devices_unregister(struct device *dev, void *data) { - ffa_release_device(dev); + device_unregister(dev); return 0; } diff --git a/drivers/firmware/arm_scmi/Kconfig b/drivers/firmware/arm_scmi/Kconfig index 7f4d2435503b..3d7081e84853 100644 --- a/drivers/firmware/arm_scmi/Kconfig +++ b/drivers/firmware/arm_scmi/Kconfig @@ -68,7 +68,7 @@ config ARM_SCMI_TRANSPORT_SMC config ARM_SCMI_TRANSPORT_VIRTIO bool "SCMI transport based on VirtIO" - depends on VIRTIO + depends on VIRTIO=y || VIRTIO=ARM_SCMI_PROTOCOL select ARM_SCMI_HAVE_TRANSPORT select ARM_SCMI_HAVE_MSG help diff --git a/drivers/firmware/arm_scmi/virtio.c b/drivers/firmware/arm_scmi/virtio.c index 224577f86928..11e8efb71375 100644 --- a/drivers/firmware/arm_scmi/virtio.c +++ b/drivers/firmware/arm_scmi/virtio.c @@ -110,18 +110,16 @@ static void scmi_finalize_message(struct scmi_vio_channel *vioch, if (vioch->is_rx) { scmi_vio_feed_vq_rx(vioch, msg); } else { - unsigned long flags; - - spin_lock_irqsave(&vioch->lock, flags); + /* Here IRQs are assumed to be already disabled by the caller */ + spin_lock(&vioch->lock); list_add(&msg->list, &vioch->free_list); - spin_unlock_irqrestore(&vioch->lock, flags); + spin_unlock(&vioch->lock); } } static void scmi_vio_complete_cb(struct virtqueue *vqueue) { unsigned long ready_flags; - unsigned long flags; unsigned int length; struct scmi_vio_channel *vioch; struct scmi_vio_msg *msg; 
@@ -140,7 +138,8 @@ static void scmi_vio_complete_cb(struct virtqueue *vqueue) goto unlock_ready_out; } - spin_lock_irqsave(&vioch->lock, flags); + /* IRQs already disabled here no need to irqsave */ + spin_lock(&vioch->lock); if (cb_enabled) { virtqueue_disable_cb(vqueue); cb_enabled = false; @@ -151,7 +150,7 @@ static void scmi_vio_complete_cb(struct virtqueue *vqueue) goto unlock_out; cb_enabled = true; } - spin_unlock_irqrestore(&vioch->lock, flags); + spin_unlock(&vioch->lock); if (msg) { msg->rx_len = length; @@ -161,11 +160,18 @@ static void scmi_vio_complete_cb(struct virtqueue *vqueue) scmi_finalize_message(vioch, msg); } + /* + * Release ready_lock and re-enable IRQs between loop iterations + * to allow virtio_chan_free() to possibly kick in and set the + * flag vioch->ready to false even in between processing of + * messages, so as to force outstanding messages to be ignored + * when system is shutting down. + */ spin_unlock_irqrestore(&vioch->ready_lock, ready_flags); } unlock_out: - spin_unlock_irqrestore(&vioch->lock, flags); + spin_unlock(&vioch->lock); unlock_ready_out: spin_unlock_irqrestore(&vioch->ready_lock, ready_flags); } @@ -384,8 +390,11 @@ static int scmi_vio_probe(struct virtio_device *vdev) struct virtqueue *vqs[VIRTIO_SCMI_VQ_MAX_CNT]; /* Only one SCMI VirtiO device allowed */ - if (scmi_vdev) - return -EINVAL; + if (scmi_vdev) { + dev_err(dev, + "One SCMI Virtio device was already initialized: only one allowed.\n"); + return -EBUSY; + } have_vq_rx = scmi_vio_have_vq_rx(vdev); vq_cnt = have_vq_rx ? VIRTIO_SCMI_VQ_MAX_CNT : 1; @@ -428,16 +437,25 @@ static int scmi_vio_probe(struct virtio_device *vdev) } vdev->priv = channels; - scmi_vdev = vdev; + /* Ensure initialized scmi_vdev is visible */ + smp_store_mb(scmi_vdev, vdev); return 0; } static void scmi_vio_remove(struct virtio_device *vdev) { + /* + * Once we get here, virtio_chan_free() will have already been called by + * the SCMI core for any existing channel and, as a consequence, all the + * virtio channels will have been already marked NOT ready, causing any + * outstanding message on any vqueue to be ignored by complete_cb: now + * we can just stop processing buffers and destroy the vqueues. 
+ */ vdev->config->reset(vdev); vdev->config->del_vqs(vdev); - scmi_vdev = NULL; + /* Ensure scmi_vdev is visible as NULL */ + smp_store_mb(scmi_vdev, NULL); } static int scmi_vio_validate(struct virtio_device *vdev) @@ -476,7 +494,7 @@ static int __init virtio_scmi_init(void) return register_virtio_driver(&virtio_scmi_driver); } -static void __exit virtio_scmi_exit(void) +static void virtio_scmi_exit(void) { unregister_virtio_driver(&virtio_scmi_driver); } diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 73bdbd207e7a..6ec8edec6329 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -25,8 +25,6 @@ #include <acpi/ghes.h> #include <ras/ras_event.h> -static char rcd_decode_str[CPER_REC_LEN]; - /* * CPER record ID need to be unique even after reboot, because record * ID is used as index for ERST storage, while CPER records from @@ -312,6 +310,7 @@ const char *cper_mem_err_unpack(struct trace_seq *p, struct cper_mem_err_compact *cmem) { const char *ret = trace_seq_buffer_ptr(p); + char rcd_decode_str[CPER_REC_LEN]; if (cper_mem_err_location(cmem, rcd_decode_str)) trace_seq_printf(p, "%s", rcd_decode_str); @@ -326,6 +325,7 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem, int len) { struct cper_mem_err_compact cmem; + char rcd_decode_str[CPER_REC_LEN]; /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */ if (len == sizeof(struct cper_sec_mem_err_old) && diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 365c3a43a198..fe567be0f118 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -271,7 +271,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, return status; } - efi_info("Exiting boot services and installing virtual address map...\n"); + efi_info("Exiting boot services...\n"); map.map = &memory_map; status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, ULONG_MAX); diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 1410beaef5c3..f3e54f6616f0 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -414,7 +414,7 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - if (down_interruptible(&efi_runtime_lock)) { + if (down_trylock(&efi_runtime_lock)) { pr_warn("failed to invoke the reset_system() runtime service:\n" "could not get exclusive access to the firmware\n"); return; diff --git a/drivers/fpga/ice40-spi.c b/drivers/fpga/ice40-spi.c index 69dec5af23c3..029d3cdb918d 100644 --- a/drivers/fpga/ice40-spi.c +++ b/drivers/fpga/ice40-spi.c @@ -192,12 +192,19 @@ static const struct of_device_id ice40_fpga_of_match[] = { }; MODULE_DEVICE_TABLE(of, ice40_fpga_of_match); +static const struct spi_device_id ice40_fpga_spi_ids[] = { + { .name = "ice40-fpga-mgr", }, + {}, +}; +MODULE_DEVICE_TABLE(spi, ice40_fpga_spi_ids); + static struct spi_driver ice40_fpga_driver = { .probe = ice40_fpga_probe, .driver = { .name = "ice40spi", .of_match_table = of_match_ptr(ice40_fpga_of_match), }, + .id_table = ice40_fpga_spi_ids, }; module_spi_driver(ice40_fpga_driver); diff --git a/drivers/gpio/gpio-74x164.c b/drivers/gpio/gpio-74x164.c index 05637d585152..4a55cdf089d6 100644 --- a/drivers/gpio/gpio-74x164.c +++ b/drivers/gpio/gpio-74x164.c @@ -174,6 +174,13 @@ static int gen_74x164_remove(struct spi_device *spi) return 0; } +static const struct spi_device_id gen_74x164_spi_ids[] = { + { 
.name = "74hc595" }, + { .name = "74lvc594" }, + {}, +}; +MODULE_DEVICE_TABLE(spi, gen_74x164_spi_ids); + static const struct of_device_id gen_74x164_dt_ids[] = { { .compatible = "fairchild,74hc595" }, { .compatible = "nxp,74lvc594" }, @@ -188,6 +195,7 @@ static struct spi_driver gen_74x164_driver = { }, .probe = gen_74x164_probe, .remove = gen_74x164_remove, + .id_table = gen_74x164_spi_ids, }; module_spi_driver(gen_74x164_driver); diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 177d03ef4529..40a052bc6784 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -256,6 +256,11 @@ mlxbf2_gpio_probe(struct platform_device *pdev) NULL, 0); + if (ret) { + dev_err(dev, "bgpio_init failed\n"); + return ret; + } + gc->direction_input = mlxbf2_gpio_direction_input; gc->direction_output = mlxbf2_gpio_direction_output; gc->ngpio = npins; diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index 0a9d746a0fe0..d26bff29157b 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -476,10 +476,19 @@ static struct platform_device *gpio_mockup_pdevs[GPIO_MOCKUP_MAX_GC]; static void gpio_mockup_unregister_pdevs(void) { + struct platform_device *pdev; + struct fwnode_handle *fwnode; int i; - for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) - platform_device_unregister(gpio_mockup_pdevs[i]); + for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) { + pdev = gpio_mockup_pdevs[i]; + if (!pdev) + continue; + + fwnode = dev_fwnode(&pdev->dev); + platform_device_unregister(pdev); + fwnode_remove_software_node(fwnode); + } } static __init char **gpio_mockup_make_line_names(const char *label, @@ -508,6 +517,7 @@ static int __init gpio_mockup_register_chip(int idx) struct property_entry properties[GPIO_MOCKUP_MAX_PROP]; struct platform_device_info pdevinfo; struct platform_device *pdev; + struct fwnode_handle *fwnode; char **line_names = NULL; char chip_label[32]; int prop = 0, base; @@ -536,13 +546,18 @@ static int __init gpio_mockup_register_chip(int idx) "gpio-line-names", line_names, ngpio); } + fwnode = fwnode_create_software_node(properties, NULL); + if (IS_ERR(fwnode)) + return PTR_ERR(fwnode); + pdevinfo.name = "gpio-mockup"; pdevinfo.id = idx; - pdevinfo.properties = properties; + pdevinfo.fwnode = fwnode; pdev = platform_device_register_full(&pdevinfo); kfree_strarray(line_names, ngpio); if (IS_ERR(pdev)) { + fwnode_remove_software_node(fwnode); pr_err("error registering device"); return PTR_ERR(pdev); } diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 8ebf369b3ba0..d2fe76f3f34f 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -559,21 +559,21 @@ static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, mutex_lock(&chip->i2c_lock); - /* Disable pull-up/pull-down */ - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); - if (ret) - goto exit; - /* Configure pull-up/pull-down */ if (config == PIN_CONFIG_BIAS_PULL_UP) ret = regmap_write_bits(chip->regmap, pull_sel_reg, bit, bit); else if (config == PIN_CONFIG_BIAS_PULL_DOWN) ret = regmap_write_bits(chip->regmap, pull_sel_reg, bit, 0); + else + ret = 0; if (ret) goto exit; - /* Enable pull-up/pull-down */ - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); + /* Disable/Enable pull-up/pull-down */ + if (config == PIN_CONFIG_BIAS_DISABLE) + ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); + else + ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); exit: mutex_unlock(&chip->i2c_lock); @@ -587,7 
+587,9 @@ static int pca953x_gpio_set_config(struct gpio_chip *gc, unsigned int offset, switch (pinconf_to_config_param(config)) { case PIN_CONFIG_BIAS_PULL_UP: + case PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: case PIN_CONFIG_BIAS_PULL_DOWN: + case PIN_CONFIG_BIAS_DISABLE: return pca953x_gpio_set_pull_up_down(chip, offset, config); default: return -ENOTSUPP; diff --git a/drivers/gpio/gpio-xgs-iproc.c b/drivers/gpio/gpio-xgs-iproc.c index fa9b4d8c3ff5..43ca52fa6f9a 100644 --- a/drivers/gpio/gpio-xgs-iproc.c +++ b/drivers/gpio/gpio-xgs-iproc.c @@ -224,7 +224,7 @@ static int iproc_gpio_probe(struct platform_device *pdev) } chip->gc.label = dev_name(dev); - if (of_property_read_u32(dn, "ngpios", &num_gpios)) + if (!of_property_read_u32(dn, "ngpios", &num_gpios)) chip->gc.ngpio = num_gpios; irq = platform_get_irq(pdev, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d356e329e6f8..269437b01328 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1087,6 +1087,7 @@ struct amdgpu_device { bool no_hw_access; struct pci_saved_state *pci_state; + pci_channel_state_t pci_channel_state; struct amdgpu_reset_control *reset_cntl; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 2d6b2d77b738..054c1a224def 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -563,6 +563,7 @@ kfd_mem_dmaunmap_userptr(struct kgd_mem *mem, dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0); sg_free_table(ttm->sg); + kfree(ttm->sg); ttm->sg = NULL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ab3794c42d36..af9bdf16eefd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2394,10 +2394,6 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; - r = amdgpu_amdkfd_resume_iommu(adev); - if (r) - goto init_failed; - r = amdgpu_device_ip_hw_init_phase1(adev); if (r) goto init_failed; @@ -2436,6 +2432,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); + r = amdgpu_amdkfd_resume_iommu(adev); + if (r) + goto init_failed; + amdgpu_fru_get_product_info(adev); init_failed: @@ -5399,6 +5399,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta return PCI_ERS_RESULT_DISCONNECT; } + adev->pci_channel_state = state; + switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; @@ -5541,6 +5543,10 @@ void amdgpu_pci_resume(struct pci_dev *pdev) DRM_INFO("PCI error: resume callback!!\n"); + /* Only continue execution for the case of pci_channel_io_frozen */ + if (adev->pci_channel_state != pci_channel_io_frozen) + return; + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e7f06bd0f0cd..1916ec84dd71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -31,6 +31,8 @@ /* delay 0.1 second to enable gfx off feature */ #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) +#define GFX_OFF_NO_DELAY 0 + /* * GPU GFX IP block helpers function. 
*/ @@ -558,6 +560,8 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev) void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) { + unsigned long delay = GFX_OFF_DELAY_ENABLE; + if (!(adev->pm.pp_feature & PP_GFXOFF_MASK)) return; @@ -573,8 +577,14 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) adev->gfx.gfx_off_req_count--; - if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state) - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); + if (adev->gfx.gfx_off_req_count == 0 && + !adev->gfx.gfx_off_state) { + /* If going to s2idle, no need to wait */ + if (adev->in_s0ix) + delay = GFX_OFF_NO_DELAY; + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, + delay); + } } else { if (adev->gfx.gfx_off_req_count == 0) { cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index ff80786e3918..01efda4398e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1257,7 +1257,7 @@ static int nv_common_early_init(void *handle) AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_JPEG; if (adev->pdev->device == 0x1681) - adev->external_rev_id = adev->rev_id + 0x19; + adev->external_rev_id = 0x20; else adev->external_rev_id = adev->rev_id + 0x01; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c2a4d920da40..4a416231b24c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1085,18 +1085,12 @@ static int kfd_resume(struct kfd_dev *kfd) int err = 0; err = kfd->dqm->ops.start(kfd->dqm); - if (err) { + if (err) dev_err(kfd_device, "Error starting queue manager for device %x:%x\n", kfd->pdev->vendor, kfd->pdev->device); - goto dqm_start_error; - } return err; - -dqm_start_error: - kfd_iommu_suspend(kfd); - return err; } static inline void kfd_queue_work(struct workqueue_struct *wq, diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 7dffc04a557e..127667e549c1 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -25,6 +25,8 @@ config DRM_AMD_DC_HDCP config DRM_AMD_DC_SI bool "AMD DC support for Southern Islands ASICs" + depends on DRM_AMDGPU_SI + depends on DRM_AMD_DC default n help Choose this option to enable new AMD DC support for SI asics diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c index 87daa78a32b8..8080bba5b7a7 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c @@ -263,7 +263,7 @@ static ssize_t dp_link_settings_write(struct file *f, const char __user *buf, if (!wr_buf) return -ENOSPC; - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -487,7 +487,7 @@ static ssize_t dp_phy_settings_write(struct file *f, const char __user *buf, if (!wr_buf) return -ENOSPC; - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -639,7 +639,7 @@ static ssize_t dp_phy_test_pattern_debugfs_write(struct file *f, const char __us if (!wr_buf) return -ENOSPC; - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, 
max_param_num, ¶m_nums)) { @@ -914,7 +914,7 @@ static ssize_t dp_dsc_passthrough_set(struct file *f, const char __user *buf, return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, ¶m, buf, max_param_num, ¶m_nums)) { @@ -1211,7 +1211,7 @@ static ssize_t trigger_hotplug(struct file *f, const char __user *buf, return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -1396,7 +1396,7 @@ static ssize_t dp_dsc_clock_en_write(struct file *f, const char __user *buf, return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -1581,7 +1581,7 @@ static ssize_t dp_dsc_slice_width_write(struct file *f, const char __user *buf, return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -1766,7 +1766,7 @@ static ssize_t dp_dsc_slice_height_write(struct file *f, const char __user *buf, return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -1944,7 +1944,7 @@ static ssize_t dp_dsc_bits_per_pixel_write(struct file *f, const char __user *bu return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { @@ -2382,7 +2382,7 @@ static ssize_t dp_max_bpc_write(struct file *f, const char __user *buf, return -ENOSPC; } - if (parse_write_buffer_into_params(wr_buf, size, + if (parse_write_buffer_into_params(wr_buf, wr_buf_size, (long *)param, buf, max_param_num, ¶m_nums)) { diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c index 4a4894e9d9c9..377c4e53a2b3 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c @@ -366,32 +366,32 @@ static struct wm_table lpddr5_wm_table = { .wm_inst = WM_A, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 5.32, - .sr_enter_plus_exit_time_us = 6.38, + .sr_exit_time_us = 11.5, + .sr_enter_plus_exit_time_us = 14.5, .valid = true, }, { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.82, - .sr_enter_plus_exit_time_us = 11.196, + .sr_exit_time_us = 11.5, + .sr_enter_plus_exit_time_us = 14.5, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.89, - .sr_enter_plus_exit_time_us = 11.24, + .sr_exit_time_us = 11.5, + .sr_enter_plus_exit_time_us = 14.5, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.65333, - .sr_exit_time_us = 9.748, - .sr_enter_plus_exit_time_us = 11.102, + .sr_exit_time_us = 11.5, + .sr_enter_plus_exit_time_us = 14.5, .valid = true, }, } @@ -518,14 +518,21 @@ static unsigned int find_clk_for_voltage( unsigned int voltage) { int i; + int max_voltage = 0; + int clock = 0; for (i = 0; i < NUM_SOC_VOLTAGE_LEVELS; i++) { - if (clock_table->SocVoltage[i] == voltage) + if (clock_table->SocVoltage[i] == voltage) { return clocks[i]; + } else if 
(clock_table->SocVoltage[i] >= max_voltage && + clock_table->SocVoltage[i] < voltage) { + max_voltage = clock_table->SocVoltage[i]; + clock = clocks[i]; + } } - ASSERT(0); - return 0; + ASSERT(clock); + return clock; } void dcn31_clk_mgr_helper_populate_bw_params( diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 05eaec03d9f7..6d655e158267 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -1306,12 +1306,6 @@ static void override_training_settings( { uint32_t lane; - /* Override link settings */ - if (link->preferred_link_setting.link_rate != LINK_RATE_UNKNOWN) - lt_settings->link_settings.link_rate = link->preferred_link_setting.link_rate; - if (link->preferred_link_setting.lane_count != LANE_COUNT_UNKNOWN) - lt_settings->link_settings.lane_count = link->preferred_link_setting.lane_count; - /* Override link spread */ if (!link->dp_ss_off && overrides->downspread != NULL) lt_settings->link_settings.link_spread = *overrides->downspread ? diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h index d8b22618b79e..c337588231ff 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_link_encoder.h @@ -118,6 +118,7 @@ struct dcn10_link_enc_registers { uint32_t RDPCSTX_PHY_CNTL4; uint32_t RDPCSTX_PHY_CNTL5; uint32_t RDPCSTX_PHY_CNTL6; + uint32_t RDPCSPIPE_PHY_CNTL6; uint32_t RDPCSTX_PHY_CNTL7; uint32_t RDPCSTX_PHY_CNTL8; uint32_t RDPCSTX_PHY_CNTL9; diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c index 90127c1f9e35..b0892443fbd5 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.c @@ -37,6 +37,7 @@ #include "link_enc_cfg.h" #include "dc_dmub_srv.h" +#include "dal_asic_id.h" #define CTX \ enc10->base.ctx @@ -62,6 +63,10 @@ #define AUX_REG_WRITE(reg_name, val) \ dm_write_reg(CTX, AUX_REG(reg_name), val) +#ifndef MIN +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#endif + void dcn31_link_encoder_set_dio_phy_mux( struct link_encoder *enc, enum encoder_type_select sel, @@ -215,8 +220,8 @@ static const struct link_encoder_funcs dcn31_link_enc_funcs = { .fec_is_active = enc2_fec_is_active, .get_dig_frontend = dcn10_get_dig_frontend, .get_dig_mode = dcn10_get_dig_mode, - .is_in_alt_mode = dcn20_link_encoder_is_in_alt_mode, - .get_max_link_cap = dcn20_link_encoder_get_max_link_cap, + .is_in_alt_mode = dcn31_link_encoder_is_in_alt_mode, + .get_max_link_cap = dcn31_link_encoder_get_max_link_cap, .set_dio_phy_mux = dcn31_link_encoder_set_dio_phy_mux, }; @@ -404,3 +409,60 @@ void dcn31_link_encoder_disable_output( } } +bool dcn31_link_encoder_is_in_alt_mode(struct link_encoder *enc) +{ + struct dcn10_link_encoder *enc10 = TO_DCN10_LINK_ENC(enc); + uint32_t dp_alt_mode_disable; + bool is_usb_c_alt_mode = false; + + if (enc->features.flags.bits.DP_IS_USB_C) { + if (enc->ctx->asic_id.hw_internal_rev != YELLOW_CARP_B0) { + // [Note] no need to check hw_internal_rev once phy mux selection is ready + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, &dp_alt_mode_disable); + } else { + /* + * B0 phys use a new set of registers to check whether alt mode is disabled. + * if value == 1 alt mode is disabled, otherwise it is enabled. 
+ */ + if ((enc10->base.transmitter == TRANSMITTER_UNIPHY_A) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_B) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_E)) { + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, &dp_alt_mode_disable); + } else { + // [Note] need to change TRANSMITTER_UNIPHY_C/D to F/G once phy mux selection is ready + REG_GET(RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, &dp_alt_mode_disable); + } + } + + is_usb_c_alt_mode = (dp_alt_mode_disable == 0); + } + + return is_usb_c_alt_mode; +} + +void dcn31_link_encoder_get_max_link_cap(struct link_encoder *enc, + struct dc_link_settings *link_settings) +{ + struct dcn10_link_encoder *enc10 = TO_DCN10_LINK_ENC(enc); + uint32_t is_in_usb_c_dp4_mode = 0; + + dcn10_link_encoder_get_max_link_cap(enc, link_settings); + + /* in usb c dp2 mode, max lane count is 2 */ + if (enc->funcs->is_in_alt_mode && enc->funcs->is_in_alt_mode(enc)) { + if (enc->ctx->asic_id.hw_internal_rev != YELLOW_CARP_B0) { + // [Note] no need to check hw_internal_rev once phy mux selection is ready + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, &is_in_usb_c_dp4_mode); + } else { + if ((enc10->base.transmitter == TRANSMITTER_UNIPHY_A) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_B) + || (enc10->base.transmitter == TRANSMITTER_UNIPHY_E)) { + REG_GET(RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, &is_in_usb_c_dp4_mode); + } else { + REG_GET(RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, &is_in_usb_c_dp4_mode); + } + } + if (!is_in_usb_c_dp4_mode) + link_settings->lane_count = MIN(LANE_COUNT_TWO, link_settings->lane_count); + } +} diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h index 32d146312838..3454f1e7c1f1 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_dio_link_encoder.h @@ -69,6 +69,7 @@ SRI(RDPCSTX_PHY_CNTL4, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL5, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL6, RDPCSTX, id), \ + SRI(RDPCSPIPE_PHY_CNTL6, RDPCSPIPE, id), \ SRI(RDPCSTX_PHY_CNTL7, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL8, RDPCSTX, id), \ SRI(RDPCSTX_PHY_CNTL9, RDPCSTX, id), \ @@ -115,7 +116,9 @@ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DP_TX2_MPLL_EN, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DP_TX3_MPLL_EN, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, mask_sh),\ - LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, mask_sh),\ + LE_SF(RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DP4, mask_sh),\ + LE_SF(RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE, mask_sh),\ + LE_SF(RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6, RDPCS_PHY_DPALT_DISABLE_ACK, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL7, RDPCS_PHY_DP_MPLLB_FRACN_QUOT, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL7, RDPCS_PHY_DP_MPLLB_FRACN_DEN, mask_sh),\ LE_SF(RDPCSTX0_RDPCSTX_PHY_CNTL8, RDPCS_PHY_DP_MPLLB_SSC_PEAK, mask_sh),\ @@ -243,4 +246,13 @@ void dcn31_link_encoder_disable_output( struct link_encoder *enc, enum signal_type signal); +/* + * Check whether USB-C DP Alt mode is disabled + */ +bool dcn31_link_encoder_is_in_alt_mode( + struct link_encoder *enc); + +void dcn31_link_encoder_get_max_link_cap(struct link_encoder *enc, + struct dc_link_settings *link_settings); + #endif /* __DC_LINK_ENCODER__DCN31_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c index 3f2333ec67e2..3afa1159a5f7 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c @@ -76,10 +76,6 @@ void dcn31_init_hw(struct dc *dc) if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks) dc->clk_mgr->funcs->init_clocks(dc->clk_mgr); - // Initialize the dccg - if (res_pool->dccg->funcs->dccg_init) - res_pool->dccg->funcs->dccg_init(res_pool->dccg); - if (IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment)) { REG_WRITE(REFCLK_CNTL, 0); @@ -106,6 +102,9 @@ void dcn31_init_hw(struct dc *dc) hws->funcs.bios_golden_init(dc); hws->funcs.disable_vga(dc->hwseq); } + // Initialize the dccg + if (res_pool->dccg->funcs->dccg_init) + res_pool->dccg->funcs->dccg_init(res_pool->dccg); if (dc->debug.enable_mem_low_power.bits.dmcu) { // Force ERAM to shutdown if DMCU is not enabled diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c index a7702d3c75cd..79e92ecca96c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c @@ -217,8 +217,8 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_1_soc = { .num_states = 5, .sr_exit_time_us = 9.0, .sr_enter_plus_exit_time_us = 11.0, - .sr_exit_z8_time_us = 402.0, - .sr_enter_plus_exit_z8_time_us = 520.0, + .sr_exit_z8_time_us = 442.0, + .sr_enter_plus_exit_z8_time_us = 560.0, .writeback_latency_us = 12.0, .dram_channel_width_bytes = 4, .round_trip_ping_latency_dcfclk_cycles = 106, @@ -928,7 +928,7 @@ static const struct dc_debug_options debug_defaults_drv = { .disable_dcc = DCC_ENABLE, .vsr_support = true, .performance_trace = false, - .max_downscale_src_width = 7680,/*upto 8K*/ + .max_downscale_src_width = 4096,/*upto true 4K*/ .disable_pplib_wm_range = false, .scl_reset_length10 = true, .sanity_checks = false, @@ -1284,6 +1284,12 @@ static struct stream_encoder *dcn31_stream_encoder_create( if (!enc1 || !vpg || !afmt) return NULL; + if (ctx->asic_id.chip_family == FAMILY_YELLOW_CARP && + ctx->asic_id.hw_internal_rev == YELLOW_CARP_B0) { + if ((eng_id == ENGINE_ID_DIGC) || (eng_id == ENGINE_ID_DIGD)) + eng_id = eng_id + 3; // For B0 only. C->F, D->G. + } + dcn30_dio_stream_encoder_construct(enc1, ctx, ctx->dc_bios, eng_id, vpg, afmt, &stream_enc_regs[eng_id], @@ -1584,6 +1590,13 @@ static int dcn31_populate_dml_pipes_from_context( pipe = &res_ctx->pipe_ctx[i]; timing = &pipe->stream->timing; + /* + * Immediate flip can be set dynamically after enabling the plane. + * We need to require support for immediate flip or underflow can be + * intermittently experienced depending on peak b/w requirements. 
+ */ + pipes[pipe_cnt].pipe.src.immediate_flip = true; + pipes[pipe_cnt].pipe.src.unbounded_req_mode = false; pipes[pipe_cnt].pipe.src.gpuvm = true; pipes[pipe_cnt].pipe.src.dcc_fraction_of_zs_req_luma = 0; diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c index ce55c9caf9a2..d58925cff420 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c @@ -5398,9 +5398,9 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l v->MaximumReadBandwidthWithPrefetch = v->MaximumReadBandwidthWithPrefetch - + dml_max4( - v->VActivePixelBandwidth[i][j][k], - v->VActiveCursorBandwidth[i][j][k] + + dml_max3( + v->VActivePixelBandwidth[i][j][k] + + v->VActiveCursorBandwidth[i][j][k] + v->NoOfDPP[i][j][k] * (v->meta_row_bandwidth[i][j][k] + v->dpte_row_bandwidth[i][j][k]), diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h index 381c17caace1..3d2f0817e40a 100644 --- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h +++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h @@ -227,7 +227,7 @@ enum { #define FAMILY_YELLOW_CARP 146 #define YELLOW_CARP_A0 0x01 -#define YELLOW_CARP_B0 0x02 // TODO: DCN31 - update with correct B0 ID +#define YELLOW_CARP_B0 0x20 #define YELLOW_CARP_UNKNOWN 0xFF #ifndef ASICREV_IS_YELLOW_CARP diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c index e9bd84ec027d..be61975f1470 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c @@ -105,6 +105,7 @@ static enum mod_hdcp_status remove_display_from_topology_v3( dtm_cmd->dtm_status = TA_DTM_STATUS__GENERIC_FAILURE; psp_dtm_invoke(psp, dtm_cmd->cmd_id); + mutex_unlock(&psp->dtm_context.mutex); if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) { status = remove_display_from_topology_v2(hdcp, index); @@ -115,8 +116,6 @@ static enum mod_hdcp_status remove_display_from_topology_v3( HDCP_TOP_REMOVE_DISPLAY_TRACE(hdcp, display->index); } - mutex_unlock(&psp->dtm_context.mutex); - return status; } @@ -205,6 +204,7 @@ static enum mod_hdcp_status add_display_to_topology_v3( dtm_cmd->dtm_in_message.topology_update_v3.link_hdcp_cap = link->hdcp_supported_informational; psp_dtm_invoke(psp, dtm_cmd->cmd_id); + mutex_unlock(&psp->dtm_context.mutex); if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) { status = add_display_to_topology_v2(hdcp, display); @@ -214,8 +214,6 @@ static enum mod_hdcp_status add_display_to_topology_v3( HDCP_TOP_ADD_DISPLAY_TRACE(hdcp, display->index); } - mutex_unlock(&psp->dtm_context.mutex); - return status; } diff --git a/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h index 92caf8441d1e..01a56556cde1 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/dpcs/dpcs_4_2_0_offset.h @@ -11932,5 +11932,32 @@ #define ixDPCSSYS_CR4_RAWLANEX_DIG_PCS_XF_RX_OVRD_OUT_2 0xe0c7 #define ixDPCSSYS_CR4_RAWLANEX_DIG_PCS_XF_TX_OVRD_IN_2 0xe0c8 +//RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4__SHIFT 0x10 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE__SHIFT 0x11 +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK__SHIFT 0x12 
+#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4_MASK 0x00010000L +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_MASK 0x00020000L +#define RDPCSPIPE0_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK_MASK 0x00040000L + +//RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4__SHIFT 0x10 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE__SHIFT 0x11 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK__SHIFT 0x12 +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DP4_MASK 0x00010000L +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_MASK 0x00020000L +#define RDPCSPIPE1_RDPCSPIPE_PHY_CNTL6__RDPCS_PHY_DPALT_DISABLE_ACK_MASK 0x00040000L + +//[Note] Hack. RDPCSPIPE only has 2 instances. +#define regRDPCSPIPE0_RDPCSPIPE_PHY_CNTL6 0x2d73 +#define regRDPCSPIPE0_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE1_RDPCSPIPE_PHY_CNTL6 0x2e4b +#define regRDPCSPIPE1_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE2_RDPCSPIPE_PHY_CNTL6 0x2d73 +#define regRDPCSPIPE2_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE3_RDPCSPIPE_PHY_CNTL6 0x2e4b +#define regRDPCSPIPE3_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 +#define regRDPCSPIPE4_RDPCSPIPE_PHY_CNTL6 0x2d73 +#define regRDPCSPIPE4_RDPCSPIPE_PHY_CNTL6_BASE_IDX 2 #endif diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index 6bfaefa01818..1e30eaeb0e1b 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ b/drivers/gpu/drm/ast/ast_mode.c @@ -1300,18 +1300,6 @@ static enum drm_mode_status ast_mode_valid(struct drm_connector *connector, return flags; } -static enum drm_connector_status ast_connector_detect(struct drm_connector - *connector, bool force) -{ - int r; - - r = ast_get_modes(connector); - if (r <= 0) - return connector_status_disconnected; - - return connector_status_connected; -} - static void ast_connector_destroy(struct drm_connector *connector) { struct ast_connector *ast_connector = to_ast_connector(connector); @@ -1327,7 +1315,6 @@ static const struct drm_connector_helper_funcs ast_connector_helper_funcs = { static const struct drm_connector_funcs ast_connector_funcs = { .reset = drm_atomic_helper_connector_reset, - .detect = ast_connector_detect, .fill_modes = drm_helper_probe_single_connector_modes, .destroy = ast_connector_destroy, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, @@ -1355,8 +1342,7 @@ static int ast_connector_init(struct drm_device *dev) connector->interlace_allowed = 0; connector->doublescan_allowed = 0; - connector->polled = DRM_CONNECTOR_POLL_CONNECT | - DRM_CONNECTOR_POLL_DISCONNECT; + connector->polled = DRM_CONNECTOR_POLL_CONNECT; drm_connector_attach_encoder(connector, encoder); @@ -1425,8 +1411,6 @@ int ast_mode_config_init(struct ast_private *ast) drm_mode_config_reset(dev); - drm_kms_helper_poll_init(dev); - return 0; } diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 6325877c5fd6..ea9a79bc9583 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -1834,11 +1834,20 @@ static void connector_bad_edid(struct drm_connector *connector, u8 *edid, int num_blocks) { int i; - u8 num_of_ext = edid[0x7e]; + u8 last_block; + + /* + * 0x7e in the EDID is the number of extension blocks. The EDID + * is 1 (base block) + num_ext_blocks big. That means we can think + * of 0x7e in the EDID of the _index_ of the last block in the + * combined chunk of memory. 
+ */ + last_block = edid[0x7e]; /* Calculate real checksum for the last edid extension block data */ - connector->real_edid_checksum = - drm_edid_block_checksum(edid + num_of_ext * EDID_LENGTH); + if (last_block < num_blocks) + connector->real_edid_checksum = + drm_edid_block_checksum(edid + last_block * EDID_LENGTH); if (connector->bad_edid_counter++ && !drm_debug_enabled(DRM_UT_KMS)) return; diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 3ab078321045..8e7a124d6c5a 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -1506,6 +1506,7 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper, { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; + struct drm_mode_config *config = &dev->mode_config; int ret = 0; int crtc_count = 0; struct drm_connector_list_iter conn_iter; @@ -1663,6 +1664,11 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper, /* Handle our overallocation */ sizes.surface_height *= drm_fbdev_overalloc; sizes.surface_height /= 100; + if (sizes.surface_height > config->max_height) { + drm_dbg_kms(dev, "Fbdev over-allocation too large; clamping height to %d\n", + config->max_height); + sizes.surface_height = config->max_height; + } /* push down into drivers */ ret = (*fb_helper->funcs->fb_probe)(fb_helper, &sizes); diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index fcfe1a03c4a1..bf8a6e823a15 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -248,7 +248,7 @@ static inline int modeset_lock(struct drm_modeset_lock *lock, if (ctx->trylock_only) { lockdep_assert_held(&ctx->ww_ctx); - if (!ww_mutex_trylock(&lock->mutex)) + if (!ww_mutex_trylock(&lock->mutex, NULL)) return -EBUSY; else return 0; diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index f6bdec7fa925..e1b2ce4921ae 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -134,6 +134,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T103HAF"), }, .driver_data = (void *)&lcd800x1280_rightside_up, + }, { /* AYA NEO 2021 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AYADEVICE"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "AYA NEO 2021"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, }, { /* GPD MicroPC (generic strings, also match on bios date) */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Default string"), @@ -185,6 +191,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "Default string"), }, .driver_data = (void *)&gpd_win2, + }, { /* GPD Win 3 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "G1618-03") + }, + .driver_data = (void *)&lcd720x1280_rightside_up, }, { /* I.T.Works TW891 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "To be filled by O.E.M."), diff --git a/drivers/gpu/drm/hyperv/hyperv_drm.h b/drivers/gpu/drm/hyperv/hyperv_drm.h index 886add4f9cd0..d2d8582b36df 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm.h +++ b/drivers/gpu/drm/hyperv/hyperv_drm.h @@ -46,6 +46,7 @@ int hyperv_mode_config_init(struct hyperv_drm_device *hv); int hyperv_update_vram_location(struct hv_device *hdev, phys_addr_t vram_pp); int hyperv_update_situation(struct hv_device *hdev, u8 active, u32 bpp, u32 w, u32 h, u32 pitch); +int 
hyperv_hide_hw_ptr(struct hv_device *hdev); int hyperv_update_dirt(struct hv_device *hdev, struct drm_rect *rect); int hyperv_connect_vsp(struct hv_device *hdev); diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c index 6dd4717d3e1e..8c97a20dfe23 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_modeset.c @@ -101,6 +101,7 @@ static void hyperv_pipe_enable(struct drm_simple_display_pipe *pipe, struct hyperv_drm_device *hv = to_hv(pipe->crtc.dev); struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(plane_state); + hyperv_hide_hw_ptr(hv->hdev); hyperv_update_situation(hv->hdev, 1, hv->screen_depth, crtc_state->mode.hdisplay, crtc_state->mode.vdisplay, diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_proto.c b/drivers/gpu/drm/hyperv/hyperv_drm_proto.c index 6d4bdccfbd1a..c0155c6271bf 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_proto.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_proto.c @@ -299,6 +299,55 @@ int hyperv_update_situation(struct hv_device *hdev, u8 active, u32 bpp, return 0; } +/* + * Hyper-V supports a hardware cursor feature. It's not used by Linux VM, + * but the Hyper-V host still draws a point as an extra mouse pointer, + * which is unwanted, especially when Xorg is running. + * + * The hyperv_fb driver uses synthvid_send_ptr() to hide the unwanted + * pointer, by setting msg.ptr_pos.is_visible = 1 and setting the + * msg.ptr_shape.data. Note: setting msg.ptr_pos.is_visible to 0 doesn't + * work in tests. + * + * Copy synthvid_send_ptr() to hyperv_drm and rename it to + * hyperv_hide_hw_ptr(). Note: hyperv_hide_hw_ptr() is also called in the + * handler of the SYNTHVID_FEATURE_CHANGE event, otherwise the host still + * draws an extra unwanted mouse pointer after the VM Connection window is + * closed and reopened. 
+ */ +int hyperv_hide_hw_ptr(struct hv_device *hdev) +{ + struct synthvid_msg msg; + + memset(&msg, 0, sizeof(struct synthvid_msg)); + msg.vid_hdr.type = SYNTHVID_POINTER_POSITION; + msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) + + sizeof(struct synthvid_pointer_position); + msg.ptr_pos.is_visible = 1; + msg.ptr_pos.video_output = 0; + msg.ptr_pos.image_x = 0; + msg.ptr_pos.image_y = 0; + hyperv_sendpacket(hdev, &msg); + + memset(&msg, 0, sizeof(struct synthvid_msg)); + msg.vid_hdr.type = SYNTHVID_POINTER_SHAPE; + msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) + + sizeof(struct synthvid_pointer_shape); + msg.ptr_shape.part_idx = SYNTHVID_CURSOR_COMPLETE; + msg.ptr_shape.is_argb = 1; + msg.ptr_shape.width = 1; + msg.ptr_shape.height = 1; + msg.ptr_shape.hot_x = 0; + msg.ptr_shape.hot_y = 0; + msg.ptr_shape.data[0] = 0; + msg.ptr_shape.data[1] = 1; + msg.ptr_shape.data[2] = 1; + msg.ptr_shape.data[3] = 1; + hyperv_sendpacket(hdev, &msg); + + return 0; +} + int hyperv_update_dirt(struct hv_device *hdev, struct drm_rect *rect) { struct hyperv_drm_device *hv = hv_get_drvdata(hdev); @@ -392,8 +441,11 @@ static void hyperv_receive_sub(struct hv_device *hdev) return; } - if (msg->vid_hdr.type == SYNTHVID_FEATURE_CHANGE) + if (msg->vid_hdr.type == SYNTHVID_FEATURE_CHANGE) { hv->dirt_needed = msg->feature_chg.is_dirt_needed; + if (hv->dirt_needed) + hyperv_hide_hw_ptr(hv->hdev); + } } static void hyperv_receive(void *ctx) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index 43ec7fcd3f5d..a3eae3f3eadc 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -1577,8 +1577,14 @@ static void gen11_dsi_sync_state(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); - struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->uapi.crtc); - enum pipe pipe = intel_crtc->pipe; + struct intel_crtc *intel_crtc; + enum pipe pipe; + + if (!crtc_state) + return; + + intel_crtc = to_intel_crtc(crtc_state->uapi.crtc); + pipe = intel_crtc->pipe; /* wa verify 1409054076:icl,jsl,ehl */ if (DISPLAY_VER(dev_priv) == 11 && pipe == PIPE_B && diff --git a/drivers/gpu/drm/i915/display/intel_acpi.c b/drivers/gpu/drm/i915/display/intel_acpi.c index 7cfe91fc05f2..68abeaf2d7d4 100644 --- a/drivers/gpu/drm/i915/display/intel_acpi.c +++ b/drivers/gpu/drm/i915/display/intel_acpi.c @@ -186,13 +186,16 @@ void intel_dsm_get_bios_data_funcs_supported(struct drm_i915_private *i915) { struct pci_dev *pdev = to_pci_dev(i915->drm.dev); acpi_handle dhandle; + union acpi_object *obj; dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return; - acpi_evaluate_dsm(dhandle, &intel_dsm_guid2, INTEL_DSM_REVISION_ID, - INTEL_DSM_FN_GET_BIOS_DATA_FUNCS_SUPPORTED, NULL); + obj = acpi_evaluate_dsm(dhandle, &intel_dsm_guid2, INTEL_DSM_REVISION_ID, + INTEL_DSM_FN_GET_BIOS_DATA_FUNCS_SUPPORTED, NULL); + if (obj) + ACPI_FREE(obj); } /* diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c index 532237588511..4e0f96bf6158 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.c +++ b/drivers/gpu/drm/i915/display/intel_audio.c @@ -1308,8 +1308,9 @@ static void i915_audio_component_init(struct drm_i915_private *dev_priv) else aud_freq = aud_freq_init; - /* use BIOS provided value for TGL unless it is a known bad value */ - if (IS_TIGERLAKE(dev_priv) && aud_freq_init != AUD_FREQ_TGL_BROKEN) + /* use BIOS provided value for TGL and 
RKL unless it is a known bad value */ + if ((IS_TIGERLAKE(dev_priv) || IS_ROCKETLAKE(dev_priv)) && + aud_freq_init != AUD_FREQ_TGL_BROKEN) aud_freq = aud_freq_init; drm_dbg_kms(&dev_priv->drm, "use AUD_FREQ_CNTRL of 0x%x (init value 0x%x)\n", diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index e86e6ed2d3bf..fd71346aac7b 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -451,13 +451,23 @@ parse_lfp_backlight(struct drm_i915_private *i915, } i915->vbt.backlight.type = INTEL_BACKLIGHT_DISPLAY_DDI; - if (bdb->version >= 191 && - get_blocksize(backlight_data) >= sizeof(*backlight_data)) { - const struct lfp_backlight_control_method *method; + if (bdb->version >= 191) { + size_t exp_size; - method = &backlight_data->backlight_control[panel_type]; - i915->vbt.backlight.type = method->type; - i915->vbt.backlight.controller = method->controller; + if (bdb->version >= 236) + exp_size = sizeof(struct bdb_lfp_backlight_data); + else if (bdb->version >= 234) + exp_size = EXP_BDB_LFP_BL_DATA_SIZE_REV_234; + else + exp_size = EXP_BDB_LFP_BL_DATA_SIZE_REV_191; + + if (get_blocksize(backlight_data) >= exp_size) { + const struct lfp_backlight_control_method *method; + + method = &backlight_data->backlight_control[panel_type]; + i915->vbt.backlight.type = method->type; + i915->vbt.backlight.controller = method->controller; + } } i915->vbt.backlight.pwm_freq_hz = entry->pwm_freq_hz; diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 9903a78df896..bd184325d0c7 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -3807,7 +3807,13 @@ void hsw_ddi_get_config(struct intel_encoder *encoder, static void intel_ddi_sync_state(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { - if (intel_crtc_has_dp_encoder(crtc_state)) + struct drm_i915_private *i915 = to_i915(encoder->base.dev); + enum phy phy = intel_port_to_phy(i915, encoder->port); + + if (intel_phy_is_tc(i915, phy)) + intel_tc_port_sanitize(enc_to_dig_port(encoder)); + + if (crtc_state && intel_crtc_has_dp_encoder(crtc_state)) intel_dp_sync_state(encoder, crtc_state); } diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 134a6acbd8fb..17f44ffea586 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -13082,18 +13082,16 @@ static void intel_modeset_readout_hw_state(struct drm_device *dev) readout_plane_state(dev_priv); for_each_intel_encoder(dev, encoder) { + struct intel_crtc_state *crtc_state = NULL; + pipe = 0; if (encoder->get_hw_state(encoder, &pipe)) { - struct intel_crtc_state *crtc_state; - crtc = intel_get_crtc_for_pipe(dev_priv, pipe); crtc_state = to_intel_crtc_state(crtc->base.state); encoder->base.crtc = &crtc->base; intel_encoder_get_config(encoder, crtc_state); - if (encoder->sync_state) - encoder->sync_state(encoder, crtc_state); /* read out to slave crtc as well for bigjoiner */ if (crtc_state->bigjoiner) { @@ -13108,6 +13106,9 @@ static void intel_modeset_readout_hw_state(struct drm_device *dev) encoder->base.crtc = NULL; } + if (encoder->sync_state) + encoder->sync_state(encoder, crtc_state); + drm_dbg_kms(&dev_priv->drm, "[ENCODER:%d:%s] hw state readout: %s, pipe %c\n", encoder->base.base.id, encoder->base.name, @@ -13390,17 +13391,6 @@ intel_modeset_setup_hw_state(struct drm_device *dev, 
intel_modeset_readout_hw_state(dev); /* HW state is read out, now we need to sanitize this mess. */ - - /* Sanitize the TypeC port mode upfront, encoders depend on this */ - for_each_intel_encoder(dev, encoder) { - enum phy phy = intel_port_to_phy(dev_priv, encoder->port); - - /* We need to sanitize only the MST primary port. */ - if (encoder->type != INTEL_OUTPUT_DP_MST && - intel_phy_is_tc(dev_priv, phy)) - intel_tc_port_sanitize(enc_to_dig_port(encoder)); - } - get_encoder_power_domains(dev_priv); if (HAS_PCH_IBX(dev_priv)) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index abe3d61b6243..5cf152be4487 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1916,6 +1916,9 @@ void intel_dp_sync_state(struct intel_encoder *encoder, { struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + if (!crtc_state) + return; + /* * Don't clobber DPCD if it's been already read out during output * setup (eDP) or detect. diff --git a/drivers/gpu/drm/i915/display/intel_vbt_defs.h b/drivers/gpu/drm/i915/display/intel_vbt_defs.h index 330077c2e588..a2108a8f544d 100644 --- a/drivers/gpu/drm/i915/display/intel_vbt_defs.h +++ b/drivers/gpu/drm/i915/display/intel_vbt_defs.h @@ -814,6 +814,11 @@ struct lfp_brightness_level { u16 reserved; } __packed; +#define EXP_BDB_LFP_BL_DATA_SIZE_REV_191 \ + offsetof(struct bdb_lfp_backlight_data, brightness_level) +#define EXP_BDB_LFP_BL_DATA_SIZE_REV_234 \ + offsetof(struct bdb_lfp_backlight_data, brightness_precision_bits) + struct bdb_lfp_backlight_data { u8 entry_size; struct lfp_backlight_data_entry data[16]; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 9ccf4b29b82e..166bb46408a9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -937,6 +937,10 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, unsigned int n; e = alloc_engines(num_engines); + if (!e) + return ERR_PTR(-ENOMEM); + e->num_engines = num_engines; + for (n = 0; n < num_engines; n++) { struct intel_context *ce; int ret; @@ -970,7 +974,6 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, goto free_engines; } } - e->num_engines = num_engines; return e; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index e382b7f2353b..5ab136ffdeb2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -118,7 +118,7 @@ i915_gem_shrink(struct i915_gem_ww_ctx *ww, intel_wakeref_t wakeref = 0; unsigned long count = 0; unsigned long scanned = 0; - int err; + int err = 0; /* CHV + VTD workaround use stop_machine(); need to trylock vm->mutex */ bool trylock_vm = !ww && intel_vm_no_concurrent_access_wa(i915); @@ -242,12 +242,15 @@ skip: list_splice_tail(&still_in_list, phase->list); spin_unlock_irqrestore(&i915->mm.obj_lock, flags); if (err) - return err; + break; } if (shrink & I915_SHRINK_BOUND) intel_runtime_pm_put(&i915->runtime_pm, wakeref); + if (err) + return err; + if (nr_scanned) *nr_scanned += scanned; return count; diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e866105dd174..17ca4dc4d0cb 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -421,6 +421,7 @@ void intel_context_fini(struct intel_context *ce) mutex_destroy(&ce->pin_mutex); 
i915_active_fini(&ce->active); + i915_sw_fence_fini(&ce->guc_blocked); } void i915_context_module_exit(void) diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index 1257f4f11e66..438bbc7b8147 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -64,7 +64,7 @@ intel_timeline_pin_map(struct intel_timeline *timeline) timeline->hwsp_map = vaddr; timeline->hwsp_seqno = memset(vaddr + ofs, 0, TIMELINE_SEQNO_BYTES); - clflush(vaddr + ofs); + drm_clflush_virt_range(vaddr + ofs, TIMELINE_SEQNO_BYTES); return 0; } @@ -225,7 +225,7 @@ void intel_timeline_reset_seqno(const struct intel_timeline *tl) memset(hwsp_seqno + 1, 0, TIMELINE_SEQNO_BYTES - sizeof(*hwsp_seqno)); WRITE_ONCE(*hwsp_seqno, tl->seqno); - clflush(hwsp_seqno); + drm_clflush_virt_range(hwsp_seqno, TIMELINE_SEQNO_BYTES); } void intel_timeline_enter(struct intel_timeline *tl) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 664970f2bc62..9023d4ecf3b3 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -8193,6 +8193,11 @@ enum { #define HSW_SPR_STRETCH_MAX_X1 REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 3) #define HSW_FBCQ_DIS (1 << 22) #define BDW_DPRS_MASK_VBLANK_SRD (1 << 0) +#define SKL_PLANE1_STRETCH_MAX_MASK REG_GENMASK(1, 0) +#define SKL_PLANE1_STRETCH_MAX_X8 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 0) +#define SKL_PLANE1_STRETCH_MAX_X4 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 1) +#define SKL_PLANE1_STRETCH_MAX_X2 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 2) +#define SKL_PLANE1_STRETCH_MAX_X1 REG_FIELD_PREP(SKL_PLANE1_STRETCH_MAX_MASK, 3) #define CHICKEN_PIPESL_1(pipe) _MMIO_PIPE(pipe, _CHICKEN_PIPESL_1_A, _CHICKEN_PIPESL_1_B) #define _CHICKEN_TRANS_A 0x420c0 @@ -11043,12 +11048,6 @@ enum skl_power_gate { #define DC_STATE_DEBUG_MASK_CORES (1 << 0) #define DC_STATE_DEBUG_MASK_MEMORY_UP (1 << 1) -#define BXT_P_CR_MC_BIOS_REQ_0_0_0 _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x7114) -#define BXT_REQ_DATA_MASK 0x3F -#define BXT_DRAM_CHANNEL_ACTIVE_SHIFT 12 -#define BXT_DRAM_CHANNEL_ACTIVE_MASK (0xF << 12) -#define BXT_MEMORY_FREQ_MULTIPLIER_HZ 133333333 - #define BXT_D_CR_DRP0_DUNIT8 0x1000 #define BXT_D_CR_DRP0_DUNIT9 0x1200 #define BXT_D_CR_DRP0_DUNIT_START 8 @@ -11079,9 +11078,7 @@ enum skl_power_gate { #define BXT_DRAM_TYPE_LPDDR4 (0x2 << 22) #define BXT_DRAM_TYPE_DDR4 (0x4 << 22) -#define SKL_MEMORY_FREQ_MULTIPLIER_HZ 266666666 #define SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5E04) -#define SKL_REQ_DATA_MASK (0xF << 0) #define DG1_GEAR_TYPE REG_BIT(16) #define SKL_MAD_INTER_CHANNEL_0_0_0_MCHBAR_MCMAIN _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5000) diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 806ad688274b..63fec1c3c132 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -794,7 +794,6 @@ DECLARE_EVENT_CLASS(i915_request, TP_STRUCT__entry( __field(u32, dev) __field(u64, ctx) - __field(u32, guc_id) __field(u16, class) __field(u16, instance) __field(u32, seqno) @@ -805,16 +804,14 @@ DECLARE_EVENT_CLASS(i915_request, __entry->dev = rq->engine->i915->drm.primary->index; __entry->class = rq->engine->uabi_class; __entry->instance = rq->engine->uabi_instance; - __entry->guc_id = rq->context->guc_id; __entry->ctx = rq->fence.context; __entry->seqno = rq->fence.seqno; __entry->tail = rq->tail; ), - TP_printk("dev=%u, engine=%u:%u, guc_id=%u, ctx=%llu, seqno=%u, tail=%u", + 
TP_printk("dev=%u, engine=%u:%u, ctx=%llu, seqno=%u, tail=%u", __entry->dev, __entry->class, __entry->instance, - __entry->guc_id, __entry->ctx, __entry->seqno, - __entry->tail) + __entry->ctx, __entry->seqno, __entry->tail) ); DEFINE_EVENT(i915_request, i915_request_add, diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 5259edacde38..066a9118c374 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <linux/sched/clock.h> struct drm_i915_private; struct timer_list; diff --git a/drivers/gpu/drm/i915/intel_dram.c b/drivers/gpu/drm/i915/intel_dram.c index 91866520c173..7acce64b0941 100644 --- a/drivers/gpu/drm/i915/intel_dram.c +++ b/drivers/gpu/drm/i915/intel_dram.c @@ -244,7 +244,6 @@ static int skl_get_dram_info(struct drm_i915_private *i915) { struct dram_info *dram_info = &i915->dram_info; - u32 mem_freq_khz, val; int ret; dram_info->type = skl_get_dram_type(i915); @@ -255,17 +254,6 @@ skl_get_dram_info(struct drm_i915_private *i915) if (ret) return ret; - val = intel_uncore_read(&i915->uncore, - SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU); - mem_freq_khz = DIV_ROUND_UP((val & SKL_REQ_DATA_MASK) * - SKL_MEMORY_FREQ_MULTIPLIER_HZ, 1000); - - if (dram_info->num_channels * mem_freq_khz == 0) { - drm_info(&i915->drm, - "Couldn't get system memory bandwidth\n"); - return -EINVAL; - } - return 0; } @@ -350,24 +338,10 @@ static void bxt_get_dimm_info(struct dram_dimm_info *dimm, u32 val) static int bxt_get_dram_info(struct drm_i915_private *i915) { struct dram_info *dram_info = &i915->dram_info; - u32 dram_channels; - u32 mem_freq_khz, val; - u8 num_active_channels, valid_ranks = 0; + u32 val; + u8 valid_ranks = 0; int i; - val = intel_uncore_read(&i915->uncore, BXT_P_CR_MC_BIOS_REQ_0_0_0); - mem_freq_khz = DIV_ROUND_UP((val & BXT_REQ_DATA_MASK) * - BXT_MEMORY_FREQ_MULTIPLIER_HZ, 1000); - - dram_channels = val & BXT_DRAM_CHANNEL_ACTIVE_MASK; - num_active_channels = hweight32(dram_channels); - - if (mem_freq_khz * num_active_channels == 0) { - drm_info(&i915->drm, - "Couldn't get system memory bandwidth\n"); - return -EINVAL; - } - /* * Now read each DUNIT8/9/10/11 to check the rank of each dimms. */ diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 65bc3709f54c..a725792d5248 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -76,6 +76,8 @@ struct intel_wm_config { static void gen9_init_clock_gating(struct drm_i915_private *dev_priv) { + enum pipe pipe; + if (HAS_LLC(dev_priv)) { /* * WaCompressedResourceDisplayNewHashMode:skl,kbl @@ -89,6 +91,16 @@ static void gen9_init_clock_gating(struct drm_i915_private *dev_priv) SKL_DE_COMPRESSED_HASH_MODE); } + for_each_pipe(dev_priv, pipe) { + /* + * "Plane N strech max must be programmed to 11b (x1) + * when Async flips are enabled on that plane." 
+ */ + if (!IS_GEMINILAKE(dev_priv) && intel_vtd_active()) + intel_uncore_rmw(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe), + SKL_PLANE1_STRETCH_MAX_MASK, SKL_PLANE1_STRETCH_MAX_X1); + } + /* See Bspec note for PSR2_CTL bit 31, Wa#828:skl,bxt,kbl,cfl */ intel_uncore_write(&dev_priv->uncore, CHICKEN_PAR1_1, intel_uncore_read(&dev_priv->uncore, CHICKEN_PAR1_1) | SKL_EDP_PSR_FIX_RDWRAP); diff --git a/drivers/gpu/drm/kmb/kmb_crtc.c b/drivers/gpu/drm/kmb/kmb_crtc.c index 44327bc629ca..06613ffeaaf8 100644 --- a/drivers/gpu/drm/kmb/kmb_crtc.c +++ b/drivers/gpu/drm/kmb/kmb_crtc.c @@ -66,7 +66,8 @@ static const struct drm_crtc_funcs kmb_crtc_funcs = { .disable_vblank = kmb_crtc_disable_vblank, }; -static void kmb_crtc_set_mode(struct drm_crtc *crtc) +static void kmb_crtc_set_mode(struct drm_crtc *crtc, + struct drm_atomic_state *old_state) { struct drm_device *dev = crtc->dev; struct drm_display_mode *m = &crtc->state->adjusted_mode; @@ -75,7 +76,7 @@ static void kmb_crtc_set_mode(struct drm_crtc *crtc) unsigned int val = 0; /* Initialize mipi */ - kmb_dsi_mode_set(kmb->kmb_dsi, m, kmb->sys_clk_mhz); + kmb_dsi_mode_set(kmb->kmb_dsi, m, kmb->sys_clk_mhz, old_state); drm_info(dev, "vfp= %d vbp= %d vsync_len=%d hfp=%d hbp=%d hsync_len=%d\n", m->crtc_vsync_start - m->crtc_vdisplay, @@ -138,7 +139,7 @@ static void kmb_crtc_atomic_enable(struct drm_crtc *crtc, struct kmb_drm_private *kmb = crtc_to_kmb_priv(crtc); clk_prepare_enable(kmb->kmb_clk.clk_lcd); - kmb_crtc_set_mode(crtc); + kmb_crtc_set_mode(crtc, state); drm_crtc_vblank_on(crtc); } @@ -185,11 +186,45 @@ static void kmb_crtc_atomic_flush(struct drm_crtc *crtc, spin_unlock_irq(&crtc->dev->event_lock); } +static enum drm_mode_status + kmb_crtc_mode_valid(struct drm_crtc *crtc, + const struct drm_display_mode *mode) +{ + int refresh; + struct drm_device *dev = crtc->dev; + int vfp = mode->vsync_start - mode->vdisplay; + + if (mode->vdisplay < KMB_CRTC_MAX_HEIGHT) { + drm_dbg(dev, "height = %d less than %d", + mode->vdisplay, KMB_CRTC_MAX_HEIGHT); + return MODE_BAD_VVALUE; + } + if (mode->hdisplay < KMB_CRTC_MAX_WIDTH) { + drm_dbg(dev, "width = %d less than %d", + mode->hdisplay, KMB_CRTC_MAX_WIDTH); + return MODE_BAD_HVALUE; + } + refresh = drm_mode_vrefresh(mode); + if (refresh < KMB_MIN_VREFRESH || refresh > KMB_MAX_VREFRESH) { + drm_dbg(dev, "refresh = %d less than %d or greater than %d", + refresh, KMB_MIN_VREFRESH, KMB_MAX_VREFRESH); + return MODE_BAD; + } + + if (vfp < KMB_CRTC_MIN_VFP) { + drm_dbg(dev, "vfp = %d less than %d", vfp, KMB_CRTC_MIN_VFP); + return MODE_BAD; + } + + return MODE_OK; +} + static const struct drm_crtc_helper_funcs kmb_crtc_helper_funcs = { .atomic_begin = kmb_crtc_atomic_begin, .atomic_enable = kmb_crtc_atomic_enable, .atomic_disable = kmb_crtc_atomic_disable, .atomic_flush = kmb_crtc_atomic_flush, + .mode_valid = kmb_crtc_mode_valid, }; int kmb_setup_crtc(struct drm_device *drm) diff --git a/drivers/gpu/drm/kmb/kmb_drv.c b/drivers/gpu/drm/kmb/kmb_drv.c index 1c2f4799f421..961ac6fb5fcf 100644 --- a/drivers/gpu/drm/kmb/kmb_drv.c +++ b/drivers/gpu/drm/kmb/kmb_drv.c @@ -172,10 +172,10 @@ static int kmb_setup_mode_config(struct drm_device *drm) ret = drmm_mode_config_init(drm); if (ret) return ret; - drm->mode_config.min_width = KMB_MIN_WIDTH; - drm->mode_config.min_height = KMB_MIN_HEIGHT; - drm->mode_config.max_width = KMB_MAX_WIDTH; - drm->mode_config.max_height = KMB_MAX_HEIGHT; + drm->mode_config.min_width = KMB_FB_MIN_WIDTH; + drm->mode_config.min_height = KMB_FB_MIN_HEIGHT; + drm->mode_config.max_width = 
KMB_FB_MAX_WIDTH; + drm->mode_config.max_height = KMB_FB_MAX_HEIGHT; drm->mode_config.funcs = &kmb_mode_config_funcs; ret = kmb_setup_crtc(drm); @@ -380,7 +380,7 @@ static irqreturn_t handle_lcd_irq(struct drm_device *dev) if (val & LAYER3_DMA_FIFO_UNDERFLOW) drm_dbg(&kmb->drm, "LAYER3:GL1 DMA UNDERFLOW val = 0x%lx", val); - if (val & LAYER3_DMA_FIFO_UNDERFLOW) + if (val & LAYER3_DMA_FIFO_OVERFLOW) drm_dbg(&kmb->drm, "LAYER3:GL1 DMA OVERFLOW val = 0x%lx", val); } diff --git a/drivers/gpu/drm/kmb/kmb_drv.h b/drivers/gpu/drm/kmb/kmb_drv.h index ebbaa5f422d5..bf085e95b28f 100644 --- a/drivers/gpu/drm/kmb/kmb_drv.h +++ b/drivers/gpu/drm/kmb/kmb_drv.h @@ -20,6 +20,18 @@ #define DRIVER_MAJOR 1 #define DRIVER_MINOR 1 +/* Platform definitions */ +#define KMB_CRTC_MIN_VFP 4 +#define KMB_CRTC_MAX_WIDTH 1920 /* max width in pixels */ +#define KMB_CRTC_MAX_HEIGHT 1080 /* max height in pixels */ +#define KMB_CRTC_MIN_WIDTH 1920 +#define KMB_CRTC_MIN_HEIGHT 1080 +#define KMB_FB_MAX_WIDTH 1920 +#define KMB_FB_MAX_HEIGHT 1080 +#define KMB_FB_MIN_WIDTH 1 +#define KMB_FB_MIN_HEIGHT 1 +#define KMB_MIN_VREFRESH 59 /*vertical refresh in Hz */ +#define KMB_MAX_VREFRESH 60 /*vertical refresh in Hz */ #define KMB_LCD_DEFAULT_CLK 200000000 #define KMB_SYS_CLK_MHZ 500 @@ -45,6 +57,7 @@ struct kmb_drm_private { spinlock_t irq_lock; int irq_lcd; int sys_clk_mhz; + struct disp_cfg init_disp_cfg[KMB_MAX_PLANES]; struct layer_status plane_status[KMB_MAX_PLANES]; int kmb_under_flow; int kmb_flush_done; diff --git a/drivers/gpu/drm/kmb/kmb_dsi.c b/drivers/gpu/drm/kmb/kmb_dsi.c index 1793cd31b117..f6071882054c 100644 --- a/drivers/gpu/drm/kmb/kmb_dsi.c +++ b/drivers/gpu/drm/kmb/kmb_dsi.c @@ -482,6 +482,10 @@ static u32 mipi_tx_fg_section_cfg(struct kmb_dsi *kmb_dsi, return 0; } +#define CLK_DIFF_LOW 50 +#define CLK_DIFF_HI 60 +#define SYSCLK_500 500 + static void mipi_tx_fg_cfg_regs(struct kmb_dsi *kmb_dsi, u8 frame_gen, struct mipi_tx_frame_timing_cfg *fg_cfg) { @@ -492,7 +496,12 @@ static void mipi_tx_fg_cfg_regs(struct kmb_dsi *kmb_dsi, u8 frame_gen, /* 500 Mhz system clock minus 50 to account for the difference in * MIPI clock speed in RTL tests */ - sysclk = kmb_dsi->sys_clk_mhz - 50; + if (kmb_dsi->sys_clk_mhz == SYSCLK_500) { + sysclk = kmb_dsi->sys_clk_mhz - CLK_DIFF_LOW; + } else { + /* 700 Mhz clk*/ + sysclk = kmb_dsi->sys_clk_mhz - CLK_DIFF_HI; + } /* PPL-Pixel Packing Layer, LLP-Low Level Protocol * Frame genartor timing parameters are clocked on the system clock, @@ -1322,7 +1331,8 @@ static u32 mipi_tx_init_dphy(struct kmb_dsi *kmb_dsi, return 0; } -static void connect_lcd_to_mipi(struct kmb_dsi *kmb_dsi) +static void connect_lcd_to_mipi(struct kmb_dsi *kmb_dsi, + struct drm_atomic_state *old_state) { struct regmap *msscam; @@ -1331,7 +1341,7 @@ static void connect_lcd_to_mipi(struct kmb_dsi *kmb_dsi) dev_dbg(kmb_dsi->dev, "failed to get msscam syscon"); return; } - + drm_atomic_bridge_chain_enable(adv_bridge, old_state); /* DISABLE MIPI->CIF CONNECTION */ regmap_write(msscam, MSS_MIPI_CIF_CFG, 0); @@ -1342,7 +1352,7 @@ static void connect_lcd_to_mipi(struct kmb_dsi *kmb_dsi) } int kmb_dsi_mode_set(struct kmb_dsi *kmb_dsi, struct drm_display_mode *mode, - int sys_clk_mhz) + int sys_clk_mhz, struct drm_atomic_state *old_state) { u64 data_rate; @@ -1384,18 +1394,13 @@ int kmb_dsi_mode_set(struct kmb_dsi *kmb_dsi, struct drm_display_mode *mode, mipi_tx_init_cfg.lane_rate_mbps = data_rate; } - kmb_write_mipi(kmb_dsi, DPHY_ENABLE, 0); - kmb_write_mipi(kmb_dsi, DPHY_INIT_CTRL0, 0); - kmb_write_mipi(kmb_dsi, 
DPHY_INIT_CTRL1, 0); - kmb_write_mipi(kmb_dsi, DPHY_INIT_CTRL2, 0); - /* Initialize mipi controller */ mipi_tx_init_cntrl(kmb_dsi, &mipi_tx_init_cfg); /* Dphy initialization */ mipi_tx_init_dphy(kmb_dsi, &mipi_tx_init_cfg); - connect_lcd_to_mipi(kmb_dsi); + connect_lcd_to_mipi(kmb_dsi, old_state); dev_info(kmb_dsi->dev, "mipi hw initialized"); return 0; diff --git a/drivers/gpu/drm/kmb/kmb_dsi.h b/drivers/gpu/drm/kmb/kmb_dsi.h index 66b7c500d9bc..09dc88743d77 100644 --- a/drivers/gpu/drm/kmb/kmb_dsi.h +++ b/drivers/gpu/drm/kmb/kmb_dsi.h @@ -380,7 +380,7 @@ int kmb_dsi_host_bridge_init(struct device *dev); struct kmb_dsi *kmb_dsi_init(struct platform_device *pdev); void kmb_dsi_host_unregister(struct kmb_dsi *kmb_dsi); int kmb_dsi_mode_set(struct kmb_dsi *kmb_dsi, struct drm_display_mode *mode, - int sys_clk_mhz); + int sys_clk_mhz, struct drm_atomic_state *old_state); int kmb_dsi_map_mmio(struct kmb_dsi *kmb_dsi); int kmb_dsi_clk_init(struct kmb_dsi *kmb_dsi); int kmb_dsi_encoder_init(struct drm_device *dev, struct kmb_dsi *kmb_dsi); diff --git a/drivers/gpu/drm/kmb/kmb_plane.c b/drivers/gpu/drm/kmb/kmb_plane.c index ecee6782612d..00404ba4126d 100644 --- a/drivers/gpu/drm/kmb/kmb_plane.c +++ b/drivers/gpu/drm/kmb/kmb_plane.c @@ -67,8 +67,21 @@ static const u32 kmb_formats_v[] = { static unsigned int check_pixel_format(struct drm_plane *plane, u32 format) { + struct kmb_drm_private *kmb; + struct kmb_plane *kmb_plane = to_kmb_plane(plane); int i; + int plane_id = kmb_plane->id; + struct disp_cfg init_disp_cfg; + kmb = to_kmb(plane->dev); + init_disp_cfg = kmb->init_disp_cfg[plane_id]; + /* Due to HW limitations, changing pixel format after initial + * plane configuration is not supported. + */ + if (init_disp_cfg.format && init_disp_cfg.format != format) { + drm_dbg(&kmb->drm, "Cannot change format after initial plane configuration"); + return -EINVAL; + } for (i = 0; i < plane->format_count; i++) { if (plane->format_types[i] == format) return 0; @@ -81,11 +94,17 @@ static int kmb_plane_atomic_check(struct drm_plane *plane, { struct drm_plane_state *new_plane_state = drm_atomic_get_new_plane_state(state, plane); + struct kmb_drm_private *kmb; + struct kmb_plane *kmb_plane = to_kmb_plane(plane); + int plane_id = kmb_plane->id; + struct disp_cfg init_disp_cfg; struct drm_framebuffer *fb; int ret; struct drm_crtc_state *crtc_state; bool can_position; + kmb = to_kmb(plane->dev); + init_disp_cfg = kmb->init_disp_cfg[plane_id]; fb = new_plane_state->fb; if (!fb || !new_plane_state->crtc) return 0; @@ -94,10 +113,21 @@ static int kmb_plane_atomic_check(struct drm_plane *plane, if (ret) return ret; - if (new_plane_state->crtc_w > KMB_MAX_WIDTH || new_plane_state->crtc_h > KMB_MAX_HEIGHT) + if (new_plane_state->crtc_w > KMB_FB_MAX_WIDTH || + new_plane_state->crtc_h > KMB_FB_MAX_HEIGHT || + new_plane_state->crtc_w < KMB_FB_MIN_WIDTH || + new_plane_state->crtc_h < KMB_FB_MIN_HEIGHT) return -EINVAL; - if (new_plane_state->crtc_w < KMB_MIN_WIDTH || new_plane_state->crtc_h < KMB_MIN_HEIGHT) + + /* Due to HW limitations, changing plane height or width after + * initial plane configuration is not supported. 
+ */ + if ((init_disp_cfg.width && init_disp_cfg.height) && + (init_disp_cfg.width != fb->width || + init_disp_cfg.height != fb->height)) { + drm_dbg(&kmb->drm, "Cannot change plane height or width after initial configuration"); return -EINVAL; + } can_position = (plane->type == DRM_PLANE_TYPE_OVERLAY); crtc_state = drm_atomic_get_existing_crtc_state(state, @@ -277,6 +307,44 @@ static void config_csc(struct kmb_drm_private *kmb, int plane_id) kmb_write_lcd(kmb, LCD_LAYERn_CSC_OFF3(plane_id), csc_coef_lcd[11]); } +static void kmb_plane_set_alpha(struct kmb_drm_private *kmb, + const struct drm_plane_state *state, + unsigned char plane_id, + unsigned int *val) +{ + u16 plane_alpha = state->alpha; + u16 pixel_blend_mode = state->pixel_blend_mode; + int has_alpha = state->fb->format->has_alpha; + + if (plane_alpha != DRM_BLEND_ALPHA_OPAQUE) + *val |= LCD_LAYER_ALPHA_STATIC; + + if (has_alpha) { + switch (pixel_blend_mode) { + case DRM_MODE_BLEND_PIXEL_NONE: + break; + case DRM_MODE_BLEND_PREMULTI: + *val |= LCD_LAYER_ALPHA_EMBED | LCD_LAYER_ALPHA_PREMULT; + break; + case DRM_MODE_BLEND_COVERAGE: + *val |= LCD_LAYER_ALPHA_EMBED; + break; + default: + DRM_DEBUG("Missing pixel blend mode case (%s == %ld)\n", + __stringify(pixel_blend_mode), + (long)pixel_blend_mode); + break; + } + } + + if (plane_alpha == DRM_BLEND_ALPHA_OPAQUE && !has_alpha) { + *val &= LCD_LAYER_ALPHA_DISABLED; + return; + } + + kmb_write_lcd(kmb, LCD_LAYERn_ALPHA(plane_id), plane_alpha); +} + static void kmb_plane_atomic_update(struct drm_plane *plane, struct drm_atomic_state *state) { @@ -296,6 +364,7 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, unsigned char plane_id; int num_planes; static dma_addr_t addr[MAX_SUB_PLANES]; + struct disp_cfg *init_disp_cfg; if (!plane || !new_plane_state || !old_plane_state) return; @@ -303,11 +372,12 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, fb = new_plane_state->fb; if (!fb) return; + num_planes = fb->format->num_planes; kmb_plane = to_kmb_plane(plane); - plane_id = kmb_plane->id; kmb = to_kmb(plane->dev); + plane_id = kmb_plane->id; spin_lock_irq(&kmb->irq_lock); if (kmb->kmb_under_flow || kmb->kmb_flush_done) { @@ -317,7 +387,8 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, } spin_unlock_irq(&kmb->irq_lock); - src_w = (new_plane_state->src_w >> 16); + init_disp_cfg = &kmb->init_disp_cfg[plane_id]; + src_w = new_plane_state->src_w >> 16; src_h = new_plane_state->src_h >> 16; crtc_x = new_plane_state->crtc_x; crtc_y = new_plane_state->crtc_y; @@ -400,20 +471,32 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, config_csc(kmb, plane_id); } + kmb_plane_set_alpha(kmb, plane->state, plane_id, &val); + kmb_write_lcd(kmb, LCD_LAYERn_CFG(plane_id), val); + /* Configure LCD_CONTROL */ + ctrl = kmb_read_lcd(kmb, LCD_CONTROL); + + /* Set layer blending config */ + ctrl &= ~LCD_CTRL_ALPHA_ALL; + ctrl |= LCD_CTRL_ALPHA_BOTTOM_VL1 | + LCD_CTRL_ALPHA_BLEND_VL2; + + ctrl &= ~LCD_CTRL_ALPHA_BLEND_BKGND_DISABLE; + switch (plane_id) { case LAYER_0: - ctrl = LCD_CTRL_VL1_ENABLE; + ctrl |= LCD_CTRL_VL1_ENABLE; break; case LAYER_1: - ctrl = LCD_CTRL_VL2_ENABLE; + ctrl |= LCD_CTRL_VL2_ENABLE; break; case LAYER_2: - ctrl = LCD_CTRL_GL1_ENABLE; + ctrl |= LCD_CTRL_GL1_ENABLE; break; case LAYER_3: - ctrl = LCD_CTRL_GL2_ENABLE; + ctrl |= LCD_CTRL_GL2_ENABLE; break; } @@ -425,7 +508,7 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, */ ctrl |= LCD_CTRL_VHSYNC_IDLE_LVL; - kmb_set_bitmask_lcd(kmb, LCD_CONTROL, ctrl); + 
kmb_write_lcd(kmb, LCD_CONTROL, ctrl); /* Enable pipeline AXI read transactions for the DMA * after setting graphics layers. This must be done @@ -448,6 +531,16 @@ static void kmb_plane_atomic_update(struct drm_plane *plane, /* Enable DMA */ kmb_write_lcd(kmb, LCD_LAYERn_DMA_CFG(plane_id), dma_cfg); + + /* Save initial display config */ + if (!init_disp_cfg->width || + !init_disp_cfg->height || + !init_disp_cfg->format) { + init_disp_cfg->width = width; + init_disp_cfg->height = height; + init_disp_cfg->format = fb->format->format; + } + drm_dbg(&kmb->drm, "dma_cfg=0x%x LCD_DMA_CFG=0x%x\n", dma_cfg, kmb_read_lcd(kmb, LCD_LAYERn_DMA_CFG(plane_id))); @@ -490,6 +583,9 @@ struct kmb_plane *kmb_plane_init(struct drm_device *drm) enum drm_plane_type plane_type; const u32 *plane_formats; int num_plane_formats; + unsigned int blend_caps = BIT(DRM_MODE_BLEND_PIXEL_NONE) | + BIT(DRM_MODE_BLEND_PREMULTI) | + BIT(DRM_MODE_BLEND_COVERAGE); for (i = 0; i < KMB_MAX_PLANES; i++) { plane = drmm_kzalloc(drm, sizeof(*plane), GFP_KERNEL); @@ -521,8 +617,16 @@ struct kmb_plane *kmb_plane_init(struct drm_device *drm) drm_dbg(drm, "%s : %d i=%d type=%d", __func__, __LINE__, i, plane_type); + drm_plane_create_alpha_property(&plane->base_plane); + + drm_plane_create_blend_mode_property(&plane->base_plane, + blend_caps); + + drm_plane_create_zpos_immutable_property(&plane->base_plane, i); + drm_plane_helper_add(&plane->base_plane, &kmb_plane_helper_funcs); + if (plane_type == DRM_PLANE_TYPE_PRIMARY) { primary = plane; kmb->plane = plane; diff --git a/drivers/gpu/drm/kmb/kmb_plane.h b/drivers/gpu/drm/kmb/kmb_plane.h index 486490f7a3ec..b51144044fe8 100644 --- a/drivers/gpu/drm/kmb/kmb_plane.h +++ b/drivers/gpu/drm/kmb/kmb_plane.h @@ -35,6 +35,9 @@ #define POSSIBLE_CRTCS 1 #define to_kmb_plane(x) container_of(x, struct kmb_plane, base_plane) +#define POSSIBLE_CRTCS 1 +#define KMB_MAX_PLANES 2 + enum layer_id { LAYER_0, LAYER_1, @@ -43,8 +46,6 @@ enum layer_id { /* KMB_MAX_PLANES */ }; -#define KMB_MAX_PLANES 1 - enum sub_plane_id { Y_PLANE, U_PLANE, @@ -62,6 +63,12 @@ struct layer_status { u32 ctrl; }; +struct disp_cfg { + unsigned int width; + unsigned int height; + unsigned int format; +}; + struct kmb_plane *kmb_plane_init(struct drm_device *drm); void kmb_plane_destroy(struct drm_plane *plane); #endif /* __KMB_PLANE_H__ */ diff --git a/drivers/gpu/drm/kmb/kmb_regs.h b/drivers/gpu/drm/kmb/kmb_regs.h index 48150569f702..9756101b0d32 100644 --- a/drivers/gpu/drm/kmb/kmb_regs.h +++ b/drivers/gpu/drm/kmb/kmb_regs.h @@ -43,8 +43,10 @@ #define LCD_CTRL_OUTPUT_ENABLED BIT(19) #define LCD_CTRL_BPORCH_ENABLE BIT(21) #define LCD_CTRL_FPORCH_ENABLE BIT(22) +#define LCD_CTRL_ALPHA_BLEND_BKGND_DISABLE BIT(23) #define LCD_CTRL_PIPELINE_DMA BIT(28) #define LCD_CTRL_VHSYNC_IDLE_LVL BIT(31) +#define LCD_CTRL_ALPHA_ALL (0xff << 6) /* interrupts */ #define LCD_INT_STATUS (0x4 * 0x001) @@ -115,6 +117,7 @@ #define LCD_LAYER_ALPHA_EMBED BIT(5) #define LCD_LAYER_ALPHA_COMBI (LCD_LAYER_ALPHA_STATIC | \ LCD_LAYER_ALPHA_EMBED) +#define LCD_LAYER_ALPHA_DISABLED ~(LCD_LAYER_ALPHA_COMBI) /* RGB multiplied with alpha */ #define LCD_LAYER_ALPHA_PREMULT BIT(6) #define LCD_LAYER_INVERT_COL BIT(7) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index 5f81489fc60c..a4e80e499674 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -4,8 +4,6 @@ */ #include <linux/clk.h> -#include <linux/dma-mapping.h> -#include <linux/mailbox_controller.h> #include 
<linux/pm_runtime.h> #include <linux/soc/mediatek/mtk-cmdq.h> #include <linux/soc/mediatek/mtk-mmsys.h> @@ -52,11 +50,8 @@ struct mtk_drm_crtc { bool pending_async_planes; #if IS_REACHABLE(CONFIG_MTK_CMDQ) - struct mbox_client cmdq_cl; - struct mbox_chan *cmdq_chan; - struct cmdq_pkt cmdq_handle; + struct cmdq_client *cmdq_client; u32 cmdq_event; - u32 cmdq_vblank_cnt; #endif struct device *mmsys_dev; @@ -227,79 +222,9 @@ struct mtk_ddp_comp *mtk_drm_ddp_comp_for_plane(struct drm_crtc *crtc, } #if IS_REACHABLE(CONFIG_MTK_CMDQ) -static int mtk_drm_cmdq_pkt_create(struct mbox_chan *chan, struct cmdq_pkt *pkt, - size_t size) +static void ddp_cmdq_cb(struct cmdq_cb_data data) { - struct device *dev; - dma_addr_t dma_addr; - - pkt->va_base = kzalloc(size, GFP_KERNEL); - if (!pkt->va_base) { - kfree(pkt); - return -ENOMEM; - } - pkt->buf_size = size; - - dev = chan->mbox->dev; - dma_addr = dma_map_single(dev, pkt->va_base, pkt->buf_size, - DMA_TO_DEVICE); - if (dma_mapping_error(dev, dma_addr)) { - dev_err(dev, "dma map failed, size=%u\n", (u32)(u64)size); - kfree(pkt->va_base); - kfree(pkt); - return -ENOMEM; - } - - pkt->pa_base = dma_addr; - - return 0; -} - -static void mtk_drm_cmdq_pkt_destroy(struct mbox_chan *chan, struct cmdq_pkt *pkt) -{ - dma_unmap_single(chan->mbox->dev, pkt->pa_base, pkt->buf_size, - DMA_TO_DEVICE); - kfree(pkt->va_base); - kfree(pkt); -} - -static void ddp_cmdq_cb(struct mbox_client *cl, void *mssg) -{ - struct mtk_drm_crtc *mtk_crtc = container_of(cl, struct mtk_drm_crtc, cmdq_cl); - struct cmdq_cb_data *data = mssg; - struct mtk_crtc_state *state; - unsigned int i; - - state = to_mtk_crtc_state(mtk_crtc->base.state); - - state->pending_config = false; - - if (mtk_crtc->pending_planes) { - for (i = 0; i < mtk_crtc->layer_nr; i++) { - struct drm_plane *plane = &mtk_crtc->planes[i]; - struct mtk_plane_state *plane_state; - - plane_state = to_mtk_plane_state(plane->state); - - plane_state->pending.config = false; - } - mtk_crtc->pending_planes = false; - } - - if (mtk_crtc->pending_async_planes) { - for (i = 0; i < mtk_crtc->layer_nr; i++) { - struct drm_plane *plane = &mtk_crtc->planes[i]; - struct mtk_plane_state *plane_state; - - plane_state = to_mtk_plane_state(plane->state); - - plane_state->pending.async_config = false; - } - mtk_crtc->pending_async_planes = false; - } - - mtk_crtc->cmdq_vblank_cnt = 0; - mtk_drm_cmdq_pkt_destroy(mtk_crtc->cmdq_chan, data->pkt); + cmdq_pkt_destroy(data.data); } #endif @@ -453,8 +378,7 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc, state->pending_vrefresh, 0, cmdq_handle); - if (!cmdq_handle) - state->pending_config = false; + state->pending_config = false; } if (mtk_crtc->pending_planes) { @@ -474,12 +398,9 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc, mtk_ddp_comp_layer_config(comp, local_layer, plane_state, cmdq_handle); - if (!cmdq_handle) - plane_state->pending.config = false; + plane_state->pending.config = false; } - - if (!cmdq_handle) - mtk_crtc->pending_planes = false; + mtk_crtc->pending_planes = false; } if (mtk_crtc->pending_async_planes) { @@ -499,12 +420,9 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc, mtk_ddp_comp_layer_config(comp, local_layer, plane_state, cmdq_handle); - if (!cmdq_handle) - plane_state->pending.async_config = false; + plane_state->pending.async_config = false; } - - if (!cmdq_handle) - mtk_crtc->pending_async_planes = false; + mtk_crtc->pending_async_planes = false; } } @@ -512,7 +430,7 @@ static void mtk_drm_crtc_update_config(struct mtk_drm_crtc *mtk_crtc, 
bool needs_vblank) { #if IS_REACHABLE(CONFIG_MTK_CMDQ) - struct cmdq_pkt *cmdq_handle = &mtk_crtc->cmdq_handle; + struct cmdq_pkt *cmdq_handle; #endif struct drm_crtc *crtc = &mtk_crtc->base; struct mtk_drm_private *priv = crtc->dev->dev_private; @@ -550,24 +468,14 @@ static void mtk_drm_crtc_update_config(struct mtk_drm_crtc *mtk_crtc, mtk_mutex_release(mtk_crtc->mutex); } #if IS_REACHABLE(CONFIG_MTK_CMDQ) - if (mtk_crtc->cmdq_chan) { - mbox_flush(mtk_crtc->cmdq_chan, 2000); - cmdq_handle->cmd_buf_size = 0; + if (mtk_crtc->cmdq_client) { + mbox_flush(mtk_crtc->cmdq_client->chan, 2000); + cmdq_handle = cmdq_pkt_create(mtk_crtc->cmdq_client, PAGE_SIZE); cmdq_pkt_clear_event(cmdq_handle, mtk_crtc->cmdq_event); cmdq_pkt_wfe(cmdq_handle, mtk_crtc->cmdq_event, false); mtk_crtc_ddp_config(crtc, cmdq_handle); cmdq_pkt_finalize(cmdq_handle); - dma_sync_single_for_device(mtk_crtc->cmdq_chan->mbox->dev, - cmdq_handle->pa_base, - cmdq_handle->cmd_buf_size, - DMA_TO_DEVICE); - /* - * CMDQ command should execute in next vblank, - * If it fail to execute in next 2 vblank, timeout happen. - */ - mtk_crtc->cmdq_vblank_cnt = 2; - mbox_send_message(mtk_crtc->cmdq_chan, cmdq_handle); - mbox_client_txdone(mtk_crtc->cmdq_chan, 0); + cmdq_pkt_flush_async(cmdq_handle, ddp_cmdq_cb, cmdq_handle); } #endif mtk_crtc->config_updating = false; @@ -581,15 +489,12 @@ static void mtk_crtc_ddp_irq(void *data) struct mtk_drm_private *priv = crtc->dev->dev_private; #if IS_REACHABLE(CONFIG_MTK_CMDQ) - if (!priv->data->shadow_register && !mtk_crtc->cmdq_chan) - mtk_crtc_ddp_config(crtc, NULL); - else if (mtk_crtc->cmdq_vblank_cnt > 0 && --mtk_crtc->cmdq_vblank_cnt == 0) - DRM_ERROR("mtk_crtc %d CMDQ execute command timeout!\n", - drm_crtc_index(&mtk_crtc->base)); + if (!priv->data->shadow_register && !mtk_crtc->cmdq_client) #else if (!priv->data->shadow_register) - mtk_crtc_ddp_config(crtc, NULL); #endif + mtk_crtc_ddp_config(crtc, NULL); + mtk_drm_finish_page_flip(mtk_crtc); } @@ -924,20 +829,16 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, mutex_init(&mtk_crtc->hw_lock); #if IS_REACHABLE(CONFIG_MTK_CMDQ) - mtk_crtc->cmdq_cl.dev = mtk_crtc->mmsys_dev; - mtk_crtc->cmdq_cl.tx_block = false; - mtk_crtc->cmdq_cl.knows_txdone = true; - mtk_crtc->cmdq_cl.rx_callback = ddp_cmdq_cb; - mtk_crtc->cmdq_chan = - mbox_request_channel(&mtk_crtc->cmdq_cl, - drm_crtc_index(&mtk_crtc->base)); - if (IS_ERR(mtk_crtc->cmdq_chan)) { + mtk_crtc->cmdq_client = + cmdq_mbox_create(mtk_crtc->mmsys_dev, + drm_crtc_index(&mtk_crtc->base)); + if (IS_ERR(mtk_crtc->cmdq_client)) { dev_dbg(dev, "mtk_crtc %d failed to create mailbox client, writing register by CPU now\n", drm_crtc_index(&mtk_crtc->base)); - mtk_crtc->cmdq_chan = NULL; + mtk_crtc->cmdq_client = NULL; } - if (mtk_crtc->cmdq_chan) { + if (mtk_crtc->cmdq_client) { ret = of_property_read_u32_index(priv->mutex_node, "mediatek,gce-events", drm_crtc_index(&mtk_crtc->base), @@ -945,18 +846,8 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, if (ret) { dev_dbg(dev, "mtk_crtc %d failed to get mediatek,gce-events property\n", drm_crtc_index(&mtk_crtc->base)); - mbox_free_channel(mtk_crtc->cmdq_chan); - mtk_crtc->cmdq_chan = NULL; - } else { - ret = mtk_drm_cmdq_pkt_create(mtk_crtc->cmdq_chan, - &mtk_crtc->cmdq_handle, - PAGE_SIZE); - if (ret) { - dev_dbg(dev, "mtk_crtc %d failed to create cmdq packet\n", - drm_crtc_index(&mtk_crtc->base)); - mbox_free_channel(mtk_crtc->cmdq_chan); - mtk_crtc->cmdq_chan = NULL; - } + cmdq_mbox_destroy(mtk_crtc->cmdq_client); + mtk_crtc->cmdq_client = NULL; 
} } #endif diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index e9c6af78b1d7..3ddf739a6f9b 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -17,7 +17,7 @@ config DRM_MSM select DRM_SCHED select SHMEM select TMPFS - select QCOM_SCM if ARCH_QCOM + select QCOM_SCM select WANT_DEV_COREDUMP select SND_SOC_HDMI_CODEC if SND_SOC select SYNC_FILE @@ -55,7 +55,7 @@ config DRM_MSM_GPU_SUDO config DRM_MSM_HDMI_HDCP bool "Enable HDMI HDCP support in MSM DRM driver" - depends on DRM_MSM && QCOM_SCM + depends on DRM_MSM default y help Choose this option to enable HDCP state machine diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c index 4534633fe7cd..8fb847c174ff 100644 --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c @@ -571,13 +571,14 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev) } icc_path = devm_of_icc_get(&pdev->dev, "gfx-mem"); - ret = IS_ERR(icc_path); - if (ret) + if (IS_ERR(icc_path)) { + ret = PTR_ERR(icc_path); goto fail; + } ocmem_icc_path = devm_of_icc_get(&pdev->dev, "ocmem"); - ret = IS_ERR(ocmem_icc_path); - if (ret) { + if (IS_ERR(ocmem_icc_path)) { + ret = PTR_ERR(ocmem_icc_path); /* allow -ENODATA, ocmem icc is optional */ if (ret != -ENODATA) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 82bebb40234d..a96ee79cc5e0 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -699,13 +699,14 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev) } icc_path = devm_of_icc_get(&pdev->dev, "gfx-mem"); - ret = IS_ERR(icc_path); - if (ret) + if (IS_ERR(icc_path)) { + ret = PTR_ERR(icc_path); goto fail; + } ocmem_icc_path = devm_of_icc_get(&pdev->dev, "ocmem"); - ret = IS_ERR(ocmem_icc_path); - if (ret) { + if (IS_ERR(ocmem_icc_path)) { + ret = PTR_ERR(ocmem_icc_path); /* allow -ENODATA, ocmem icc is optional */ if (ret != -ENODATA) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index a7c58018959f..8b73f70766a4 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -296,6 +296,8 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) u32 val; int request, ack; + WARN_ON_ONCE(!mutex_is_locked(&gmu->lock)); + if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits)) return -EINVAL; @@ -337,6 +339,8 @@ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) { int bit; + WARN_ON_ONCE(!mutex_is_locked(&gmu->lock)); + if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits)) return; @@ -1482,6 +1486,8 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) if (!pdev) return -ENODEV; + mutex_init(&gmu->lock); + gmu->dev = &pdev->dev; of_dma_configure(gmu->dev, node, true); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index 3c74f64e3126..84bd516f01e8 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -44,6 +44,9 @@ struct a6xx_gmu_bo { struct a6xx_gmu { struct device *dev; + /* For serializing communication with the GMU: */ + struct mutex lock; + struct msm_gem_address_space *aspace; void * __iomem mmio; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 40c9fef457a4..267a880811d6 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -106,7 +106,7 @@ 
static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu, u32 asid; u64 memptr = rbmemptr(ring, ttbr0); - if (ctx == a6xx_gpu->cur_ctx) + if (ctx->seqno == a6xx_gpu->cur_ctx_seqno) return; if (msm_iommu_pagetable_params(ctx->aspace->mmu, &ttbr, &asid)) @@ -139,7 +139,7 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu, OUT_PKT7(ring, CP_EVENT_WRITE, 1); OUT_RING(ring, 0x31); - a6xx_gpu->cur_ctx = ctx; + a6xx_gpu->cur_ctx_seqno = ctx->seqno; } static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) @@ -881,7 +881,7 @@ static int a6xx_zap_shader_init(struct msm_gpu *gpu) A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \ A6XX_RBBM_INT_0_MASK_UCHE_TRAP_INTR) -static int a6xx_hw_init(struct msm_gpu *gpu) +static int hw_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); @@ -1081,7 +1081,7 @@ static int a6xx_hw_init(struct msm_gpu *gpu) /* Always come up on rb 0 */ a6xx_gpu->cur_ring = gpu->rb[0]; - a6xx_gpu->cur_ctx = NULL; + a6xx_gpu->cur_ctx_seqno = 0; /* Enable the SQE_to start the CP engine */ gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1); @@ -1135,6 +1135,19 @@ out: return ret; } +static int a6xx_hw_init(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + int ret; + + mutex_lock(&a6xx_gpu->gmu.lock); + ret = hw_init(gpu); + mutex_unlock(&a6xx_gpu->gmu.lock); + + return ret; +} + static void a6xx_dump(struct msm_gpu *gpu) { DRM_DEV_INFO(&gpu->pdev->dev, "status: %08x\n", @@ -1509,7 +1522,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) trace_msm_gpu_resume(0); + mutex_lock(&a6xx_gpu->gmu.lock); ret = a6xx_gmu_resume(a6xx_gpu); + mutex_unlock(&a6xx_gpu->gmu.lock); if (ret) return ret; @@ -1532,7 +1547,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) msm_devfreq_suspend(gpu); + mutex_lock(&a6xx_gpu->gmu.lock); ret = a6xx_gmu_stop(a6xx_gpu); + mutex_unlock(&a6xx_gpu->gmu.lock); if (ret) return ret; @@ -1547,18 +1564,19 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); - static DEFINE_MUTEX(perfcounter_oob); - mutex_lock(&perfcounter_oob); + mutex_lock(&a6xx_gpu->gmu.lock); /* Force the GPU power on so we can read this register */ a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); *value = gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_COUNTER_LO, - REG_A6XX_CP_ALWAYS_ON_COUNTER_HI); + REG_A6XX_CP_ALWAYS_ON_COUNTER_HI); a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); - mutex_unlock(&perfcounter_oob); + + mutex_unlock(&a6xx_gpu->gmu.lock); + return 0; } @@ -1622,6 +1640,16 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu) return (unsigned long)busy_time; } +void a6xx_gpu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + + mutex_lock(&a6xx_gpu->gmu.lock); + a6xx_gmu_set_freq(gpu, opp); + mutex_unlock(&a6xx_gpu->gmu.lock); +} + static struct msm_gem_address_space * a6xx_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) { @@ -1766,7 +1794,7 @@ static const struct adreno_gpu_funcs funcs = { #endif .gpu_busy = a6xx_gpu_busy, .gpu_get_freq = a6xx_gmu_get_freq, - .gpu_set_freq = a6xx_gmu_set_freq, + .gpu_set_freq = a6xx_gpu_set_freq, #if defined(CONFIG_DRM_MSM_GPU_STATE) .gpu_state_get = a6xx_gpu_state_get, .gpu_state_put = 
a6xx_gpu_state_put, @@ -1810,6 +1838,13 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) adreno_cmp_rev(ADRENO_REV(6, 3, 5, ANY_ID), info->rev))) adreno_gpu->base.hw_apriv = true; + /* + * For now only clamp to idle freq for devices where this is known not + * to cause power supply issues: + */ + if (info && (info->revn == 618)) + gpu->clamp_to_idle = true; + a6xx_llc_slices_init(pdev, a6xx_gpu); ret = a6xx_set_supported_hw(&pdev->dev, config->rev); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 0bc2d062f54a..8e5527c881b1 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -19,7 +19,16 @@ struct a6xx_gpu { uint64_t sqe_iova; struct msm_ringbuffer *cur_ring; - struct msm_file_private *cur_ctx; + + /** + * cur_ctx_seqno: + * + * The ctx->seqno value of the context with current pgtables + * installed. Tracked by seqno rather than pointer value to + * avoid dangling pointers, and cases where a ctx can be freed + * and a new one created with the same address. + */ + int cur_ctx_seqno; struct a6xx_gmu gmu; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index b131fd376192..700d65e39feb 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -794,7 +794,7 @@ static const struct dpu_pingpong_cfg sm8150_pp[] = { DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR2, 30), -1), PP_BLK("pingpong_5", PINGPONG_5, 0x72800, MERGE_3D_2, sdm845_pp_sblk, - DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR2, 30), + DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR2, 31), -1), }; diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c index f482e0911d03..bb7d066618e6 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c @@ -1125,6 +1125,20 @@ static void mdp5_crtc_reset(struct drm_crtc *crtc) __drm_atomic_helper_crtc_reset(crtc, &mdp5_cstate->base); } +static const struct drm_crtc_funcs mdp5_crtc_no_lm_cursor_funcs = { + .set_config = drm_atomic_helper_set_config, + .destroy = mdp5_crtc_destroy, + .page_flip = drm_atomic_helper_page_flip, + .reset = mdp5_crtc_reset, + .atomic_duplicate_state = mdp5_crtc_duplicate_state, + .atomic_destroy_state = mdp5_crtc_destroy_state, + .atomic_print_state = mdp5_crtc_atomic_print_state, + .get_vblank_counter = mdp5_crtc_get_vblank_counter, + .enable_vblank = msm_crtc_enable_vblank, + .disable_vblank = msm_crtc_disable_vblank, + .get_vblank_timestamp = drm_crtc_vblank_helper_get_vblank_timestamp, +}; + static const struct drm_crtc_funcs mdp5_crtc_funcs = { .set_config = drm_atomic_helper_set_config, .destroy = mdp5_crtc_destroy, @@ -1313,6 +1327,8 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev, mdp5_crtc->lm_cursor_enabled = cursor_plane ? false : true; drm_crtc_init_with_planes(dev, crtc, plane, cursor_plane, + cursor_plane ? 
+ &mdp5_crtc_no_lm_cursor_funcs : &mdp5_crtc_funcs, NULL); drm_flip_work_init(&mdp5_crtc->unref_cursor_work, diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index fbe4c2cd52a3..a0392e4d8134 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -1309,14 +1309,14 @@ static int dp_pm_resume(struct device *dev) * can not declared display is connected unless * HDMI cable is plugged in and sink_count of * dongle become 1 + * also only signal audio when disconnected */ - if (dp->link->sink_count) + if (dp->link->sink_count) { dp->dp_display.is_connected = true; - else + } else { dp->dp_display.is_connected = false; - - dp_display_handle_plugged_change(g_dp_display, - dp->dp_display.is_connected); + dp_display_handle_plugged_change(g_dp_display, false); + } DRM_DEBUG_DP("After, sink_count=%d is_connected=%d core_inited=%d power_on=%d\n", dp->link->sink_count, dp->dp_display.is_connected, diff --git a/drivers/gpu/drm/msm/dsi/dsi.c b/drivers/gpu/drm/msm/dsi/dsi.c index 614dc7f26f2c..75ae3008b68f 100644 --- a/drivers/gpu/drm/msm/dsi/dsi.c +++ b/drivers/gpu/drm/msm/dsi/dsi.c @@ -215,8 +215,10 @@ int msm_dsi_modeset_init(struct msm_dsi *msm_dsi, struct drm_device *dev, goto fail; } - if (!msm_dsi_manager_validate_current_config(msm_dsi->id)) + if (!msm_dsi_manager_validate_current_config(msm_dsi->id)) { + ret = -EINVAL; goto fail; + } msm_dsi->encoder = encoder; diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c b/drivers/gpu/drm/msm/dsi/dsi_host.c index e269df285136..c86b5090fae6 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_host.c +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c @@ -451,7 +451,7 @@ static int dsi_bus_clk_enable(struct msm_dsi_host *msm_host) return 0; err: - for (; i > 0; i--) + while (--i >= 0) clk_disable_unprepare(msm_host->bus_clks[i]); return ret; diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c index d13552b2213b..5b4e991f220d 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c @@ -110,14 +110,13 @@ static struct dsi_pll_14nm *pll_14nm_list[DSI_MAX]; static bool pll_14nm_poll_for_ready(struct dsi_pll_14nm *pll_14nm, u32 nb_tries, u32 timeout_us) { - bool pll_locked = false; + bool pll_locked = false, pll_ready = false; void __iomem *base = pll_14nm->phy->pll_base; u32 tries, val; tries = nb_tries; while (tries--) { - val = dsi_phy_read(base + - REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); + val = dsi_phy_read(base + REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); pll_locked = !!(val & BIT(5)); if (pll_locked) @@ -126,23 +125,24 @@ static bool pll_14nm_poll_for_ready(struct dsi_pll_14nm *pll_14nm, udelay(timeout_us); } - if (!pll_locked) { - tries = nb_tries; - while (tries--) { - val = dsi_phy_read(base + - REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); - pll_locked = !!(val & BIT(0)); + if (!pll_locked) + goto out; - if (pll_locked) - break; + tries = nb_tries; + while (tries--) { + val = dsi_phy_read(base + REG_DSI_14nm_PHY_PLL_RESET_SM_READY_STATUS); + pll_ready = !!(val & BIT(0)); - udelay(timeout_us); - } + if (pll_ready) + break; + + udelay(timeout_us); } - DBG("DSI PLL is %slocked", pll_locked ? "" : "*not* "); +out: + DBG("DSI PLL is %slocked, %sready", pll_locked ? "" : "*not* ", pll_ready ? 
"" : "*not* "); - return pll_locked; + return pll_locked && pll_ready; } static void dsi_pll_14nm_config_init(struct dsi_pll_config *pconf) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c index aaa37456f4ee..71ed4aa0dc67 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c @@ -428,7 +428,7 @@ static int pll_28nm_register(struct dsi_pll_28nm *pll_28nm, struct clk_hw **prov bytediv->reg = pll_28nm->phy->pll_base + REG_DSI_28nm_8960_PHY_PLL_CTRL_9; snprintf(parent_name, 32, "dsi%dvco_clk", pll_28nm->phy->id); - snprintf(clk_name, 32, "dsi%dpllbyte", pll_28nm->phy->id); + snprintf(clk_name, 32, "dsi%dpllbyte", pll_28nm->phy->id + 1); bytediv_init.name = clk_name; bytediv_init.ops = &clk_bytediv_ops; @@ -442,7 +442,7 @@ static int pll_28nm_register(struct dsi_pll_28nm *pll_28nm, struct clk_hw **prov return ret; provided_clocks[DSI_BYTE_PLL_CLK] = &bytediv->hw; - snprintf(clk_name, 32, "dsi%dpll", pll_28nm->phy->id); + snprintf(clk_name, 32, "dsi%dpll", pll_28nm->phy->id + 1); /* DIV3 */ hw = devm_clk_hw_register_divider(dev, clk_name, parent_name, 0, pll_28nm->phy->pll_base + diff --git a/drivers/gpu/drm/msm/edp/edp_ctrl.c b/drivers/gpu/drm/msm/edp/edp_ctrl.c index 4fb397ee7c84..fe1366b4c49f 100644 --- a/drivers/gpu/drm/msm/edp/edp_ctrl.c +++ b/drivers/gpu/drm/msm/edp/edp_ctrl.c @@ -1116,7 +1116,7 @@ void msm_edp_ctrl_power(struct edp_ctrl *ctrl, bool on) int msm_edp_ctrl_init(struct msm_edp *edp) { struct edp_ctrl *ctrl = NULL; - struct device *dev = &edp->pdev->dev; + struct device *dev; int ret; if (!edp) { @@ -1124,6 +1124,7 @@ int msm_edp_ctrl_init(struct msm_edp *edp) return -EINVAL; } + dev = &edp->pdev->dev; ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL); if (!ctrl) return -ENOMEM; diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 2e6fc185e54d..d4e09703a87d 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -630,10 +630,11 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) if (ret) goto err_msm_uninit; - ret = msm_disp_snapshot_init(ddev); - if (ret) - DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret = %d\n", ret); - + if (kms) { + ret = msm_disp_snapshot_init(ddev); + if (ret) + DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret = %d\n", ret); + } drm_mode_config_reset(ddev); #ifdef CONFIG_DRM_FBDEV_EMULATION @@ -682,6 +683,7 @@ static void load_gpu(struct drm_device *dev) static int context_init(struct drm_device *dev, struct drm_file *file) { + static atomic_t ident = ATOMIC_INIT(0); struct msm_drm_private *priv = dev->dev_private; struct msm_file_private *ctx; @@ -689,12 +691,17 @@ static int context_init(struct drm_device *dev, struct drm_file *file) if (!ctx) return -ENOMEM; + INIT_LIST_HEAD(&ctx->submitqueues); + rwlock_init(&ctx->queuelock); + kref_init(&ctx->ref); msm_submitqueue_init(dev, ctx); ctx->aspace = msm_gpu_create_private_address_space(priv->gpu, current); file->driver_priv = ctx; + ctx->seqno = atomic_inc_return(&ident); + return 0; } diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 8b005d1ac899..c552f0c3890c 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -53,14 +53,6 @@ struct msm_disp_state; #define FRAC_16_16(mult, div) (((mult) << 16) / (div)) -struct msm_file_private { - rwlock_t queuelock; - struct list_head submitqueues; - int queueid; - struct msm_gem_address_space 
*aspace; - struct kref ref; -}; - enum msm_mdp_plane_property { PLANE_PROP_ZPOS, PLANE_PROP_ALPHA, @@ -488,41 +480,6 @@ void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); void msm_rmw(void __iomem *addr, u32 mask, u32 or); -struct msm_gpu_submitqueue; -int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); -struct msm_gpu_submitqueue *msm_submitqueue_get(struct msm_file_private *ctx, - u32 id); -int msm_submitqueue_create(struct drm_device *drm, - struct msm_file_private *ctx, - u32 prio, u32 flags, u32 *id); -int msm_submitqueue_query(struct drm_device *drm, struct msm_file_private *ctx, - struct drm_msm_submitqueue_query *args); -int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id); -void msm_submitqueue_close(struct msm_file_private *ctx); - -void msm_submitqueue_destroy(struct kref *kref); - -static inline void __msm_file_private_destroy(struct kref *kref) -{ - struct msm_file_private *ctx = container_of(kref, - struct msm_file_private, ref); - - msm_gem_address_space_put(ctx->aspace); - kfree(ctx); -} - -static inline void msm_file_private_put(struct msm_file_private *ctx) -{ - kref_put(&ctx->ref, __msm_file_private_destroy); -} - -static inline struct msm_file_private *msm_file_private_get( - struct msm_file_private *ctx) -{ - kref_get(&ctx->ref); - return ctx; -} - #define DBG(fmt, ...) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__) #define VERB(fmt, ...) if (0) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__) @@ -547,7 +504,7 @@ static inline int align_pitch(int width, int bpp) static inline unsigned long timeout_to_jiffies(const ktime_t *timeout) { ktime_t now = ktime_get(); - unsigned long remaining_jiffies; + s64 remaining_jiffies; if (ktime_compare(*timeout, now) < 0) { remaining_jiffies = 0; @@ -556,7 +513,7 @@ static inline unsigned long timeout_to_jiffies(const ktime_t *timeout) remaining_jiffies = ktime_divns(rem, NSEC_PER_SEC / HZ); } - return remaining_jiffies; + return clamp(remaining_jiffies, 0LL, (s64)INT_MAX); } #endif /* __MSM_DRV_H__ */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index fdc5367aecaa..151d19e4453c 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -46,7 +46,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, if (!submit) return ERR_PTR(-ENOMEM); - ret = drm_sched_job_init(&submit->base, &queue->entity, queue); + ret = drm_sched_job_init(&submit->base, queue->entity, queue); if (ret) { kfree(submit); return ERR_PTR(ret); @@ -171,7 +171,8 @@ out: static int submit_lookup_cmds(struct msm_gem_submit *submit, struct drm_msm_gem_submit *args, struct drm_file *file) { - unsigned i, sz; + unsigned i; + size_t sz; int ret = 0; for (i = 0; i < args->nr_cmds; i++) { @@ -907,7 +908,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* The scheduler owns a ref now: */ msm_gem_submit_get(submit); - drm_sched_entity_push_job(&submit->base, &queue->entity); + drm_sched_entity_push_job(&submit->base, queue->entity); args->fence = submit->fence_id; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 0e4b45bff2e6..ee25d556c8a1 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -203,6 +203,10 @@ struct msm_gpu { uint32_t suspend_count; struct msm_gpu_state *crashstate; + + /* Enable clamping to idle freq when inactive: */ + bool clamp_to_idle; + /* True if the hardware supports expanded apriv (a650 and newer) */ bool hw_apriv; @@ 
-258,6 +262,39 @@ struct msm_gpu_perfcntr { #define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN) /** + * struct msm_file_private - per-drm_file context + * + * @queuelock: synchronizes access to submitqueues list + * @submitqueues: list of &msm_gpu_submitqueue created by userspace + * @queueid: counter incremented each time a submitqueue is created, + * used to assign &msm_gpu_submitqueue.id + * @aspace: the per-process GPU address-space + * @ref: reference count + * @seqno: unique per process seqno + */ +struct msm_file_private { + rwlock_t queuelock; + struct list_head submitqueues; + int queueid; + struct msm_gem_address_space *aspace; + struct kref ref; + int seqno; + + /** + * entities: + * + * Table of per-priority-level sched entities used by submitqueues + * associated with this &drm_file. Because some userspace apps + * make assumptions about rendering from multiple gl contexts + * (of the same priority) within the process happening in FIFO + * order without requiring any fencing beyond MakeCurrent(), we + * create at most one &drm_sched_entity per-process per-priority- + * level. + */ + struct drm_sched_entity *entities[NR_SCHED_PRIORITIES * MSM_GPU_MAX_RINGS]; +}; + +/** * msm_gpu_convert_priority - Map userspace priority to ring # and sched priority * * @gpu: the gpu instance @@ -304,6 +341,8 @@ static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, } /** + * struct msm_gpu_submitqueues - Userspace created context. + * * A submitqueue is associated with a gl context or vk queue (or equiv) * in userspace. * @@ -321,7 +360,7 @@ static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, * seqno, protected by submitqueue lock * @lock: submitqueue lock * @ref: reference count - * @entity: the submit job-queue + * @entity: the submit job-queue */ struct msm_gpu_submitqueue { int id; @@ -333,7 +372,7 @@ struct msm_gpu_submitqueue { struct idr fence_idr; struct mutex lock; struct kref ref; - struct drm_sched_entity entity; + struct drm_sched_entity *entity; }; struct msm_gpu_state_bo { @@ -421,6 +460,33 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val) int msm_gpu_pm_suspend(struct msm_gpu *gpu); int msm_gpu_pm_resume(struct msm_gpu *gpu); +int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); +struct msm_gpu_submitqueue *msm_submitqueue_get(struct msm_file_private *ctx, + u32 id); +int msm_submitqueue_create(struct drm_device *drm, + struct msm_file_private *ctx, + u32 prio, u32 flags, u32 *id); +int msm_submitqueue_query(struct drm_device *drm, struct msm_file_private *ctx, + struct drm_msm_submitqueue_query *args); +int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id); +void msm_submitqueue_close(struct msm_file_private *ctx); + +void msm_submitqueue_destroy(struct kref *kref); + +void __msm_file_private_destroy(struct kref *kref); + +static inline void msm_file_private_put(struct msm_file_private *ctx) +{ + kref_put(&ctx->ref, __msm_file_private_destroy); +} + +static inline struct msm_file_private *msm_file_private_get( + struct msm_file_private *ctx) +{ + kref_get(&ctx->ref); + return ctx; +} + void msm_devfreq_init(struct msm_gpu *gpu); void msm_devfreq_cleanup(struct msm_gpu *gpu); void msm_devfreq_resume(struct msm_gpu *gpu); diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 0a1ee20296a2..20006d060b5b 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ 
-151,6 +151,9 @@ void msm_devfreq_active(struct msm_gpu *gpu) unsigned int idle_time; unsigned long target_freq = df->idle_freq; + if (!df->devfreq) + return; + /* * Hold devfreq lock to synchronize with get_dev_status()/ * target() callbacks @@ -186,6 +189,9 @@ void msm_devfreq_idle(struct msm_gpu *gpu) struct msm_gpu_devfreq *df = &gpu->devfreq; unsigned long idle_freq, target_freq = 0; + if (!df->devfreq) + return; + /* * Hold devfreq lock to synchronize with get_dev_status()/ * target() callbacks @@ -194,7 +200,8 @@ void msm_devfreq_idle(struct msm_gpu *gpu) idle_freq = get_freq(gpu); - msm_devfreq_target(&gpu->pdev->dev, &target_freq, 0); + if (gpu->clamp_to_idle) + msm_devfreq_target(&gpu->pdev->dev, &target_freq, 0); df->idle_time = ktime_get(); df->idle_freq = idle_freq; diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 32a55d81b58b..b8621c6e0554 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -7,6 +7,24 @@ #include "msm_gpu.h" +void __msm_file_private_destroy(struct kref *kref) +{ + struct msm_file_private *ctx = container_of(kref, + struct msm_file_private, ref); + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->entities); i++) { + if (!ctx->entities[i]) + continue; + + drm_sched_entity_destroy(ctx->entities[i]); + kfree(ctx->entities[i]); + } + + msm_gem_address_space_put(ctx->aspace); + kfree(ctx); +} + void msm_submitqueue_destroy(struct kref *kref) { struct msm_gpu_submitqueue *queue = container_of(kref, @@ -14,8 +32,6 @@ void msm_submitqueue_destroy(struct kref *kref) idr_destroy(&queue->fence_idr); - drm_sched_entity_destroy(&queue->entity); - msm_file_private_put(queue->ctx); kfree(queue); @@ -61,13 +77,47 @@ void msm_submitqueue_close(struct msm_file_private *ctx) } } +static struct drm_sched_entity * +get_sched_entity(struct msm_file_private *ctx, struct msm_ringbuffer *ring, + unsigned ring_nr, enum drm_sched_priority sched_prio) +{ + static DEFINE_MUTEX(entity_lock); + unsigned idx = (ring_nr * NR_SCHED_PRIORITIES) + sched_prio; + + /* We should have already validated that the requested priority is + * valid by the time we get here. 
+ */ + if (WARN_ON(idx >= ARRAY_SIZE(ctx->entities))) + return ERR_PTR(-EINVAL); + + mutex_lock(&entity_lock); + + if (!ctx->entities[idx]) { + struct drm_sched_entity *entity; + struct drm_gpu_scheduler *sched = &ring->sched; + int ret; + + entity = kzalloc(sizeof(*ctx->entities[idx]), GFP_KERNEL); + + ret = drm_sched_entity_init(entity, sched_prio, &sched, 1, NULL); + if (ret) { + kfree(entity); + return ERR_PTR(ret); + } + + ctx->entities[idx] = entity; + } + + mutex_unlock(&entity_lock); + + return ctx->entities[idx]; +} + int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, u32 prio, u32 flags, u32 *id) { struct msm_drm_private *priv = drm->dev_private; struct msm_gpu_submitqueue *queue; - struct msm_ringbuffer *ring; - struct drm_gpu_scheduler *sched; enum drm_sched_priority sched_prio; unsigned ring_nr; int ret; @@ -91,12 +141,10 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, queue->flags = flags; queue->ring_nr = ring_nr; - ring = priv->gpu->rb[ring_nr]; - sched = &ring->sched; - - ret = drm_sched_entity_init(&queue->entity, - sched_prio, &sched, 1, NULL); - if (ret) { + queue->entity = get_sched_entity(ctx, priv->gpu->rb[ring_nr], + ring_nr, sched_prio); + if (IS_ERR(queue->entity)) { + ret = PTR_ERR(queue->entity); kfree(queue); return ret; } @@ -140,10 +188,6 @@ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) */ default_prio = DIV_ROUND_UP(max_priority, 2); - INIT_LIST_HEAD(&ctx->submitqueues); - - rwlock_init(&ctx->queuelock); - return msm_submitqueue_create(drm, ctx, default_prio, 0, NULL); } diff --git a/drivers/gpu/drm/mxsfb/mxsfb_drv.c b/drivers/gpu/drm/mxsfb/mxsfb_drv.c index ec0432fe1bdf..86d78634a979 100644 --- a/drivers/gpu/drm/mxsfb/mxsfb_drv.c +++ b/drivers/gpu/drm/mxsfb/mxsfb_drv.c @@ -173,7 +173,11 @@ static void mxsfb_irq_disable(struct drm_device *drm) struct mxsfb_drm_private *mxsfb = drm->dev_private; mxsfb_enable_axi_clk(mxsfb); - mxsfb->crtc.funcs->disable_vblank(&mxsfb->crtc); + + /* Disable and clear VBLANK IRQ */ + writel(CTRL1_CUR_FRAME_DONE_IRQ_EN, mxsfb->base + LCDC_CTRL1 + REG_CLR); + writel(CTRL1_CUR_FRAME_DONE_IRQ, mxsfb->base + LCDC_CTRL1 + REG_CLR); + mxsfb_disable_axi_clk(mxsfb); } diff --git a/drivers/gpu/drm/nouveau/dispnv50/crc.c b/drivers/gpu/drm/nouveau/dispnv50/crc.c index b8c31b697797..66f32d965c72 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/crc.c +++ b/drivers/gpu/drm/nouveau/dispnv50/crc.c @@ -704,6 +704,7 @@ static const struct file_operations nv50_crc_flip_threshold_fops = { .open = nv50_crc_debugfs_flip_threshold_open, .read = seq_read, .write = nv50_crc_debugfs_flip_threshold_set, + .release = single_release, }; int nv50_head_crc_late_register(struct nv50_head *head) diff --git a/drivers/gpu/drm/nouveau/dispnv50/head.c b/drivers/gpu/drm/nouveau/dispnv50/head.c index d66f97280282..72099d1e4816 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/head.c +++ b/drivers/gpu/drm/nouveau/dispnv50/head.c @@ -52,6 +52,7 @@ nv50_head_flush_clr(struct nv50_head *head, void nv50_head_flush_set_wndw(struct nv50_head *head, struct nv50_head_atom *asyh) { + if (asyh->set.curs ) head->func->curs_set(head, asyh); if (asyh->set.olut ) { asyh->olut.offset = nv50_lut_load(&head->olut, asyh->olut.buffer, @@ -67,7 +68,6 @@ nv50_head_flush_set(struct nv50_head *head, struct nv50_head_atom *asyh) if (asyh->set.view ) head->func->view (head, asyh); if (asyh->set.mode ) head->func->mode (head, asyh); if (asyh->set.core ) head->func->core_set(head, asyh); - if 
(asyh->set.curs ) head->func->curs_set(head, asyh); if (asyh->set.base ) head->func->base (head, asyh); if (asyh->set.ovly ) head->func->ovly (head, asyh); if (asyh->set.dither ) head->func->dither (head, asyh); diff --git a/drivers/gpu/drm/nouveau/include/nvif/class.h b/drivers/gpu/drm/nouveau/include/nvif/class.h index c68cc957248e..a582c0cb0cb0 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/class.h +++ b/drivers/gpu/drm/nouveau/include/nvif/class.h @@ -71,6 +71,7 @@ #define PASCAL_CHANNEL_GPFIFO_A /* cla06f.h */ 0x0000c06f #define VOLTA_CHANNEL_GPFIFO_A /* clc36f.h */ 0x0000c36f #define TURING_CHANNEL_GPFIFO_A /* clc36f.h */ 0x0000c46f +#define AMPERE_CHANNEL_GPFIFO_B /* clc36f.h */ 0x0000c76f #define NV50_DISP /* cl5070.h */ 0x00005070 #define G82_DISP /* cl5070.h */ 0x00008270 @@ -200,6 +201,7 @@ #define PASCAL_DMA_COPY_B 0x0000c1b5 #define VOLTA_DMA_COPY_A 0x0000c3b5 #define TURING_DMA_COPY_A 0x0000c5b5 +#define AMPERE_DMA_COPY_B 0x0000c7b5 #define FERMI_DECOMPRESS 0x000090b8 diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h index 54fab7cc36c1..64ee82c7c1be 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h @@ -77,4 +77,5 @@ int gp100_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct int gp10b_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); int gv100_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); int tu102_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); +int ga102_fifo_new(struct nvkm_device *, enum nvkm_subdev_type, int inst, struct nvkm_fifo **); #endif diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 6d07e653f82d..c58bcdba2c7a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -844,6 +844,7 @@ nouveau_bo_move_init(struct nouveau_drm *drm) struct ttm_resource *, struct ttm_resource *); int (*init)(struct nouveau_channel *, u32 handle); } _methods[] = { + { "COPY", 4, 0xc7b5, nve0_bo_move_copy, nve0_bo_move_init }, { "COPY", 4, 0xc5b5, nve0_bo_move_copy, nve0_bo_move_init }, { "GRCE", 0, 0xc5b5, nve0_bo_move_copy, nvc0_bo_move_init }, { "COPY", 4, 0xc3b5, nve0_bo_move_copy, nve0_bo_move_init }, diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 80099ef75702..ea7769135b0d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -250,7 +250,8 @@ static int nouveau_channel_ind(struct nouveau_drm *drm, struct nvif_device *device, u64 runlist, bool priv, struct nouveau_channel **pchan) { - static const u16 oclasses[] = { TURING_CHANNEL_GPFIFO_A, + static const u16 oclasses[] = { AMPERE_CHANNEL_GPFIFO_B, + TURING_CHANNEL_GPFIFO_A, VOLTA_CHANNEL_GPFIFO_A, PASCAL_CHANNEL_GPFIFO_A, MAXWELL_CHANNEL_GPFIFO_A, @@ -386,7 +387,8 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart) nvif_object_map(&chan->user, NULL, 0); - if (chan->user.oclass >= FERMI_CHANNEL_GPFIFO) { + if (chan->user.oclass >= FERMI_CHANNEL_GPFIFO && + chan->user.oclass < AMPERE_CHANNEL_GPFIFO_B) { ret = nvif_notify_ctor(&chan->user, "abi16ChanKilled", nouveau_channel_killed, true, NV906F_V0_NTFY_KILLED, diff --git a/drivers/gpu/drm/nouveau/nouveau_debugfs.c b/drivers/gpu/drm/nouveau/nouveau_debugfs.c index c2bc05eb2e54..1cbe01048b93 100644 --- 
a/drivers/gpu/drm/nouveau/nouveau_debugfs.c +++ b/drivers/gpu/drm/nouveau/nouveau_debugfs.c @@ -207,6 +207,7 @@ static const struct file_operations nouveau_pstate_fops = { .open = nouveau_debugfs_pstate_open, .read = seq_read, .write = nouveau_debugfs_pstate_set, + .release = single_release, }; static struct drm_info_list nouveau_debugfs_list[] = { diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 1f828c9f691c..6109cd9e3399 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -345,6 +345,9 @@ nouveau_accel_gr_init(struct nouveau_drm *drm) u32 arg0, arg1; int ret; + if (device->info.family >= NV_DEVICE_INFO_V0_AMPERE) + return; + /* Allocate channel that has access to the graphics engine. */ if (device->info.family >= NV_DEVICE_INFO_V0_KEPLER) { arg0 = nvif_fifo_runlist(device, NV_DEVICE_HOST_RUNLIST_ENGINES_GR); @@ -469,6 +472,7 @@ nouveau_accel_init(struct nouveau_drm *drm) case PASCAL_CHANNEL_GPFIFO_A: case VOLTA_CHANNEL_GPFIFO_A: case TURING_CHANNEL_GPFIFO_A: + case AMPERE_CHANNEL_GPFIFO_B: ret = nvc0_fence_create(drm); break; default: diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 5b27845075a1..8c2ecc282723 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -247,10 +247,8 @@ nouveau_gem_new(struct nouveau_cli *cli, u64 size, int align, uint32_t domain, } ret = nouveau_bo_init(nvbo, size, align, domain, NULL, NULL); - if (ret) { - nouveau_bo_ref(NULL, &nvbo); + if (ret) return ret; - } /* we restrict allowed domains on nv50+ to only the types * that were requested at creation time. not possibly on diff --git a/drivers/gpu/drm/nouveau/nv84_fence.c b/drivers/gpu/drm/nouveau/nv84_fence.c index 7c9c928c3196..c3526a8622e3 100644 --- a/drivers/gpu/drm/nouveau/nv84_fence.c +++ b/drivers/gpu/drm/nouveau/nv84_fence.c @@ -204,7 +204,7 @@ nv84_fence_create(struct nouveau_drm *drm) priv->base.context_new = nv84_fence_context_new; priv->base.context_del = nv84_fence_context_del; - priv->base.uevent = true; + priv->base.uevent = drm->client.device.info.family < NV_DEVICE_INFO_V0_AMPERE; mutex_init(&priv->mutex); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 93ddf63d1114..ca75c5f6ecaf 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2602,6 +2602,7 @@ nv172_chipset = { .top = { 0x00000001, ga100_top_new }, .disp = { 0x00000001, ga102_disp_new }, .dma = { 0x00000001, gv100_dma_new }, + .fifo = { 0x00000001, ga102_fifo_new }, }; static const struct nvkm_device_chip @@ -2622,6 +2623,7 @@ nv174_chipset = { .top = { 0x00000001, ga100_top_new }, .disp = { 0x00000001, ga102_disp_new }, .dma = { 0x00000001, gv100_dma_new }, + .fifo = { 0x00000001, ga102_fifo_new }, }; static const struct nvkm_device_chip @@ -2642,6 +2644,7 @@ nv177_chipset = { .top = { 0x00000001, ga100_top_new }, .disp = { 0x00000001, ga102_disp_new }, .dma = { 0x00000001, gv100_dma_new }, + .fifo = { 0x00000001, ga102_fifo_new }, }; static int diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild index 3209eb7af65f..5e831d347a95 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild @@ -18,6 +18,7 @@ nvkm-y += nvkm/engine/fifo/gp100.o nvkm-y += nvkm/engine/fifo/gp10b.o nvkm-y += 
nvkm/engine/fifo/gv100.o nvkm-y += nvkm/engine/fifo/tu102.o +nvkm-y += nvkm/engine/fifo/ga102.o nvkm-y += nvkm/engine/fifo/chan.o nvkm-y += nvkm/engine/fifo/channv50.o diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c index 353b77d9b3dc..3492c561f2cf 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c @@ -82,7 +82,7 @@ g84_fifo_chan_engine_fini(struct nvkm_fifo_chan *base, if (offset < 0) return 0; - engn = fifo->base.func->engine_id(&fifo->base, engine); + engn = fifo->base.func->engine_id(&fifo->base, engine) - 1; save = nvkm_mask(device, 0x002520, 0x0000003f, 1 << engn); nvkm_wr32(device, 0x0032fc, chan->base.inst->addr >> 12); done = nvkm_msec(device, 2000, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c new file mode 100644 index 000000000000..c630dbd2911a --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c @@ -0,0 +1,311 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#define ga102_fifo(p) container_of((p), struct ga102_fifo, base.engine) +#define ga102_chan(p) container_of((p), struct ga102_chan, object) +#include <engine/fifo.h> +#include "user.h" + +#include <core/memory.h> +#include <subdev/mmu.h> +#include <subdev/timer.h> +#include <subdev/top.h> + +#include <nvif/cl0080.h> +#include <nvif/clc36f.h> +#include <nvif/class.h> + +struct ga102_fifo { + struct nvkm_fifo base; +}; + +struct ga102_chan { + struct nvkm_object object; + + struct { + u32 runl; + u32 chan; + } ctrl; + + struct nvkm_memory *mthd; + struct nvkm_memory *inst; + struct nvkm_memory *user; + struct nvkm_memory *runl; + + struct nvkm_vmm *vmm; +}; + +static int +ga102_chan_sclass(struct nvkm_object *object, int index, struct nvkm_oclass *oclass) +{ + if (index == 0) { + oclass->ctor = nvkm_object_new; + oclass->base = (struct nvkm_sclass) { -1, -1, AMPERE_DMA_COPY_B }; + return 0; + } + + return -EINVAL; +} + +static int +ga102_chan_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *addr, u64 *size) +{ + struct ga102_chan *chan = ga102_chan(object); + struct nvkm_device *device = chan->object.engine->subdev.device; + u64 bar2 = nvkm_memory_bar2(chan->user); + + if (bar2 == ~0ULL) + return -EFAULT; + + *type = NVKM_OBJECT_MAP_IO; + *addr = device->func->resource_addr(device, 3) + bar2; + *size = 0x1000; + return 0; +} + +static int +ga102_chan_fini(struct nvkm_object *object, bool suspend) +{ + struct ga102_chan *chan = ga102_chan(object); + struct nvkm_device *device = chan->object.engine->subdev.device; + + nvkm_wr32(device, chan->ctrl.chan, 0x00000003); + + nvkm_wr32(device, chan->ctrl.runl + 0x098, 0x01000000); + nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, chan->ctrl.runl + 0x098) & 0x00100000)) + break; + ); + + nvkm_wr32(device, chan->ctrl.runl + 0x088, 0); + + nvkm_wr32(device, chan->ctrl.chan, 0xffffffff); + return 0; +} + +static int +ga102_chan_init(struct nvkm_object *object) +{ + struct ga102_chan *chan = ga102_chan(object); + struct nvkm_device *device = chan->object.engine->subdev.device; + + nvkm_mask(device, chan->ctrl.runl + 0x300, 0x80000000, 0x80000000); + + nvkm_wr32(device, chan->ctrl.runl + 0x080, lower_32_bits(nvkm_memory_addr(chan->runl))); + nvkm_wr32(device, chan->ctrl.runl + 0x084, upper_32_bits(nvkm_memory_addr(chan->runl))); + nvkm_wr32(device, chan->ctrl.runl + 0x088, 2); + + nvkm_wr32(device, chan->ctrl.chan, 0x00000002); + nvkm_wr32(device, chan->ctrl.runl + 0x0090, 0); + return 0; +} + +static void * +ga102_chan_dtor(struct nvkm_object *object) +{ + struct ga102_chan *chan = ga102_chan(object); + + if (chan->vmm) { + nvkm_vmm_part(chan->vmm, chan->inst); + nvkm_vmm_unref(&chan->vmm); + } + + nvkm_memory_unref(&chan->runl); + nvkm_memory_unref(&chan->user); + nvkm_memory_unref(&chan->inst); + nvkm_memory_unref(&chan->mthd); + return chan; +} + +static const struct nvkm_object_func +ga102_chan = { + .dtor = ga102_chan_dtor, + .init = ga102_chan_init, + .fini = ga102_chan_fini, + .map = ga102_chan_map, + .sclass = ga102_chan_sclass, +}; + +static int +ga102_chan_new(struct nvkm_device *device, + const struct nvkm_oclass *oclass, void *argv, u32 argc, struct nvkm_object **pobject) +{ + struct volta_channel_gpfifo_a_v0 *args = argv; + struct nvkm_top_device *tdev; + struct nvkm_vmm *vmm; + struct ga102_chan *chan; + int ret; + + if (argc != sizeof(*args)) + return -ENOSYS; + + vmm = nvkm_uvmm_search(oclass->client, args->vmm); + if (IS_ERR(vmm)) + return PTR_ERR(vmm); + + if (!(chan = 
kzalloc(sizeof(*chan), GFP_KERNEL))) + return -ENOMEM; + + nvkm_object_ctor(&ga102_chan, oclass, &chan->object); + *pobject = &chan->object; + + list_for_each_entry(tdev, &device->top->device, head) { + if (tdev->type == NVKM_ENGINE_CE) { + chan->ctrl.runl = tdev->runlist; + break; + } + } + + if (!chan->ctrl.runl) + return -ENODEV; + + chan->ctrl.chan = nvkm_rd32(device, chan->ctrl.runl + 0x004) & 0xfffffff0; + + args->chid = 0; + args->inst = 0; + args->token = nvkm_rd32(device, chan->ctrl.runl + 0x008) & 0xffff0000; + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->mthd); + if (ret) + return ret; + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->inst); + if (ret) + return ret; + + nvkm_kmap(chan->inst); + nvkm_wo32(chan->inst, 0x010, 0x0000face); + nvkm_wo32(chan->inst, 0x030, 0x7ffff902); + nvkm_wo32(chan->inst, 0x048, lower_32_bits(args->ioffset)); + nvkm_wo32(chan->inst, 0x04c, upper_32_bits(args->ioffset) | + (order_base_2(args->ilength / 8) << 16)); + nvkm_wo32(chan->inst, 0x084, 0x20400000); + nvkm_wo32(chan->inst, 0x094, 0x30000001); + nvkm_wo32(chan->inst, 0x0ac, 0x00020000); + nvkm_wo32(chan->inst, 0x0e4, 0x00000000); + nvkm_wo32(chan->inst, 0x0e8, 0); + nvkm_wo32(chan->inst, 0x0f4, 0x00001000); + nvkm_wo32(chan->inst, 0x0f8, 0x10003080); + nvkm_mo32(chan->inst, 0x218, 0x00000000, 0x00000000); + nvkm_wo32(chan->inst, 0x220, lower_32_bits(nvkm_memory_bar2(chan->mthd))); + nvkm_wo32(chan->inst, 0x224, upper_32_bits(nvkm_memory_bar2(chan->mthd))); + nvkm_done(chan->inst); + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->user); + if (ret) + return ret; + + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0x1000, true, &chan->runl); + if (ret) + return ret; + + nvkm_kmap(chan->runl); + nvkm_wo32(chan->runl, 0x00, 0x80030001); + nvkm_wo32(chan->runl, 0x04, 1); + nvkm_wo32(chan->runl, 0x08, 0); + nvkm_wo32(chan->runl, 0x0c, 0x00000000); + nvkm_wo32(chan->runl, 0x10, lower_32_bits(nvkm_memory_addr(chan->user))); + nvkm_wo32(chan->runl, 0x14, upper_32_bits(nvkm_memory_addr(chan->user))); + nvkm_wo32(chan->runl, 0x18, lower_32_bits(nvkm_memory_addr(chan->inst))); + nvkm_wo32(chan->runl, 0x1c, upper_32_bits(nvkm_memory_addr(chan->inst))); + nvkm_done(chan->runl); + + ret = nvkm_vmm_join(vmm, chan->inst); + if (ret) + return ret; + + chan->vmm = nvkm_vmm_ref(vmm); + return 0; +} + +static const struct nvkm_device_oclass +ga102_chan_oclass = { + .ctor = ga102_chan_new, +}; + +static int +ga102_user_new(struct nvkm_device *device, + const struct nvkm_oclass *oclass, void *argv, u32 argc, struct nvkm_object **pobject) +{ + return tu102_fifo_user_new(oclass, argv, argc, pobject); +} + +static const struct nvkm_device_oclass +ga102_user_oclass = { + .ctor = ga102_user_new, +}; + +static int +ga102_fifo_sclass(struct nvkm_oclass *oclass, int index, const struct nvkm_device_oclass **class) +{ + if (index == 0) { + oclass->base = (struct nvkm_sclass) { -1, -1, VOLTA_USERMODE_A }; + *class = &ga102_user_oclass; + return 0; + } else + if (index == 1) { + oclass->base = (struct nvkm_sclass) { 0, 0, AMPERE_CHANNEL_GPFIFO_B }; + *class = &ga102_chan_oclass; + return 0; + } + + return 2; +} + +static int +ga102_fifo_info(struct nvkm_engine *engine, u64 mthd, u64 *data) +{ + switch (mthd) { + case NV_DEVICE_HOST_CHANNELS: *data = 1; return 0; + default: + break; + } + + return -ENOSYS; +} + +static void * +ga102_fifo_dtor(struct nvkm_engine *engine) +{ + return ga102_fifo(engine); +} + 
+static const struct nvkm_engine_func +ga102_fifo = { + .dtor = ga102_fifo_dtor, + .info = ga102_fifo_info, + .base.sclass = ga102_fifo_sclass, +}; + +int +ga102_fifo_new(struct nvkm_device *device, enum nvkm_subdev_type type, int inst, + struct nvkm_fifo **pfifo) +{ + struct ga102_fifo *fifo; + + if (!(fifo = kzalloc(sizeof(*fifo), GFP_KERNEL))) + return -ENOMEM; + + nvkm_engine_ctor(&ga102_fifo, device, type, inst, true, &fifo->base.engine); + *pfifo = &fifo->base; + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c index 31933f3e5a07..c982d834c8d9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/top/ga100.c @@ -54,7 +54,7 @@ ga100_top_oneinit(struct nvkm_top *top) info->reset = (data & 0x0000001f); break; case 2: - info->runlist = (data & 0x0000fc00) >> 10; + info->runlist = (data & 0x00fffc00); info->engine = (data & 0x00000003); break; default: @@ -85,9 +85,10 @@ ga100_top_oneinit(struct nvkm_top *top) } nvkm_debug(subdev, "%02x.%d (%8s): addr %06x fault %2d " - "runlist %2d engine %2d reset %2d\n", type, inst, + "runlist %6x engine %2d reset %2d\n", type, inst, info->type == NVKM_SUBDEV_NR ? "????????" : nvkm_subdev_type[info->type], - info->addr, info->fault, info->runlist, info->engine, info->reset); + info->addr, info->fault, info->runlist < 0 ? 0 : info->runlist, + info->engine, info->reset); info = NULL; } diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index beb581b96ecd..418638e6e3b0 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -295,6 +295,7 @@ config DRM_PANEL_OLIMEX_LCD_OLINUXINO depends on OF depends on I2C depends on BACKLIGHT_CLASS_DEVICE + select CRC32 help The panel is used with different sizes LCDs, from 480x272 to 1280x800, and 24 bit per pixel. 
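A note on the ga100_top change above: info->runlist now keeps the raw byte offset of the runlist control block instead of a decoded 6-bit index, which is why the debug print switches from "%2d" to "%6x" and why the new ga102.c code can assign tdev->runlist straight to chan->ctrl.runl and use it as a register base. The following stand-alone sketch only illustrates the two decodings; the table word value is made up for illustration and is not taken from real hardware.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical NV_TOP device-info word; the real value comes from
	 * the GPU's topology table. */
	uint32_t data = 0x00458401;

	/* Old decoding: a small runlist index. */
	uint32_t runlist_index = (data & 0x0000fc00) >> 10;

	/* New decoding: the byte offset of the runlist registers, usable
	 * directly as an MMIO base (cf. chan->ctrl.runl above). */
	uint32_t runlist_offset = data & 0x00fffc00;

	printf("index %2u, offset 0x%06x\n", runlist_index, runlist_offset);
	return 0;
}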
diff --git a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c index 2d8794d495d0..3d8a9ab47cae 100644 --- a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c +++ b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c @@ -146,8 +146,8 @@ static const struct reg_sequence y030xx067a_init_sequence[] = { { 0x09, REG09_SUB_BRIGHT_R(0x20) }, { 0x0a, REG0A_SUB_BRIGHT_B(0x20) }, { 0x0b, REG0B_HD_FREERUN | REG0B_VD_FREERUN }, - { 0x0c, REG0C_CONTRAST_R(0x10) }, - { 0x0d, REG0D_CONTRAST_G(0x10) }, + { 0x0c, REG0C_CONTRAST_R(0x00) }, + { 0x0d, REG0D_CONTRAST_G(0x00) }, { 0x0e, REG0E_CONTRAST_B(0x10) }, { 0x0f, 0 }, { 0x10, REG10_BRIGHT(0x7f) }, diff --git a/drivers/gpu/drm/panel/panel-ilitek-ili9881c.c b/drivers/gpu/drm/panel/panel-ilitek-ili9881c.c index 0145129d7c66..534dd7414d42 100644 --- a/drivers/gpu/drm/panel/panel-ilitek-ili9881c.c +++ b/drivers/gpu/drm/panel/panel-ilitek-ili9881c.c @@ -590,14 +590,14 @@ static const struct drm_display_mode k101_im2byl02_default_mode = { .clock = 69700, .hdisplay = 800, - .hsync_start = 800 + 6, - .hsync_end = 800 + 6 + 15, - .htotal = 800 + 6 + 15 + 16, + .hsync_start = 800 + 52, + .hsync_end = 800 + 52 + 8, + .htotal = 800 + 52 + 8 + 48, .vdisplay = 1280, - .vsync_start = 1280 + 8, - .vsync_end = 1280 + 8 + 48, - .vtotal = 1280 + 8 + 48 + 52, + .vsync_start = 1280 + 16, + .vsync_end = 1280 + 16 + 6, + .vtotal = 1280 + 16 + 6 + 15, .width_mm = 135, .height_mm = 217, diff --git a/drivers/gpu/drm/r128/ati_pcigart.c b/drivers/gpu/drm/r128/ati_pcigart.c index 0ecccf25a3c7..d2a0f5394fef 100644 --- a/drivers/gpu/drm/r128/ati_pcigart.c +++ b/drivers/gpu/drm/r128/ati_pcigart.c @@ -214,7 +214,7 @@ int drm_ati_pcigart_init(struct drm_device *dev, struct drm_ati_pcigart_info *ga } ret = 0; -#if defined(__i386__) || defined(__x86_64__) +#ifdef CONFIG_X86 wbinvd(); #else mb(); diff --git a/drivers/gpu/drm/rcar-du/rcar_du_encoder.c b/drivers/gpu/drm/rcar-du/rcar_du_encoder.c index 0daa8bba50f5..4bf4e25d7f01 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_encoder.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_encoder.c @@ -86,12 +86,20 @@ int rcar_du_encoder_init(struct rcar_du_device *rcdu, } /* - * Create and initialize the encoder. On Gen3 skip the LVDS1 output if + * Create and initialize the encoder. On Gen3, skip the LVDS1 output if * the LVDS1 encoder is used as a companion for LVDS0 in dual-link - * mode. + * mode, or any LVDS output if it isn't connected. The latter may happen + * on D3 or E3 as the LVDS encoders are needed to provide the pixel + * clock to the DU, even when the LVDS outputs are not used. 
*/ - if (rcdu->info->gen >= 3 && output == RCAR_DU_OUTPUT_LVDS1) { - if (rcar_lvds_dual_link(bridge)) + if (rcdu->info->gen >= 3) { + if (output == RCAR_DU_OUTPUT_LVDS1 && + rcar_lvds_dual_link(bridge)) + return -ENOLINK; + + if ((output == RCAR_DU_OUTPUT_LVDS0 || + output == RCAR_DU_OUTPUT_LVDS1) && + !rcar_lvds_is_connected(bridge)) return -ENOLINK; } diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds.c b/drivers/gpu/drm/rcar-du/rcar_lvds.c index d061b8de748f..b672c5bd72ee 100644 --- a/drivers/gpu/drm/rcar-du/rcar_lvds.c +++ b/drivers/gpu/drm/rcar-du/rcar_lvds.c @@ -576,6 +576,9 @@ static int rcar_lvds_attach(struct drm_bridge *bridge, { struct rcar_lvds *lvds = bridge_to_rcar_lvds(bridge); + if (!lvds->next_bridge) + return 0; + return drm_bridge_attach(bridge->encoder, lvds->next_bridge, bridge, flags); } @@ -598,6 +601,14 @@ bool rcar_lvds_dual_link(struct drm_bridge *bridge) } EXPORT_SYMBOL_GPL(rcar_lvds_dual_link); +bool rcar_lvds_is_connected(struct drm_bridge *bridge) +{ + struct rcar_lvds *lvds = bridge_to_rcar_lvds(bridge); + + return lvds->next_bridge != NULL; +} +EXPORT_SYMBOL_GPL(rcar_lvds_is_connected); + /* ----------------------------------------------------------------------------- * Probe & Remove */ diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds.h b/drivers/gpu/drm/rcar-du/rcar_lvds.h index 222ec0e60785..eb7c6ef03b00 100644 --- a/drivers/gpu/drm/rcar-du/rcar_lvds.h +++ b/drivers/gpu/drm/rcar-du/rcar_lvds.h @@ -16,6 +16,7 @@ struct drm_bridge; int rcar_lvds_clk_enable(struct drm_bridge *bridge, unsigned long freq); void rcar_lvds_clk_disable(struct drm_bridge *bridge); bool rcar_lvds_dual_link(struct drm_bridge *bridge); +bool rcar_lvds_is_connected(struct drm_bridge *bridge); #else static inline int rcar_lvds_clk_enable(struct drm_bridge *bridge, unsigned long freq) @@ -27,6 +28,10 @@ static inline bool rcar_lvds_dual_link(struct drm_bridge *bridge) { return false; } +static inline bool rcar_lvds_is_connected(struct drm_bridge *bridge) +{ + return false; +} #endif /* CONFIG_DRM_RCAR_LVDS */ #endif /* __RCAR_LVDS_H__ */ diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c index ba9e14da41b4..a25b98b7f5bd 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c @@ -1174,26 +1174,24 @@ static bool vop_crtc_mode_fixup(struct drm_crtc *crtc, * * Action plan: * - * 1. When DRM gives us a mode, we should add 999 Hz to it. That way - * if the clock we need is 60000001 Hz (~60 MHz) and DRM tells us to - * make 60000 kHz then the clock framework will actually give us - * the right clock. + * 1. Try to set the exact rate first, and confirm the clock framework + * can provide it. * - * NOTE: if the PLL (maybe through a divider) could actually make - * a clock rate 999 Hz higher instead of the one we want then this - * could be a problem. Unfortunately there's not much we can do - * since it's baked into DRM to use kHz. It shouldn't matter in - * practice since Rockchip PLLs are controlled by tables and - * even if there is a divider in the middle I wouldn't expect PLL - * rates in the table that are just a few kHz different. + * 2. If the clock framework cannot provide the exact rate, we should + * add 999 Hz to the requested rate. That way if the clock we need + * is 60000001 Hz (~60 MHz) and DRM tells us to make 60000 kHz then + * the clock framework will actually give us the right clock. * - * 2. Get the clock framework to round the rate for us to tell us + * 3. 
Get the clock framework to round the rate for us to tell us * what it will actually make. * - * 3. Store the rounded up rate so that we don't need to worry about + * 4. Store the rounded up rate so that we don't need to worry about * this in the actual clk_set_rate(). */ - rate = clk_round_rate(vop->dclk, adjusted_mode->clock * 1000 + 999); + rate = clk_round_rate(vop->dclk, adjusted_mode->clock * 1000); + if (rate / 1000 != adjusted_mode->clock) + rate = clk_round_rate(vop->dclk, + adjusted_mode->clock * 1000 + 999); adjusted_mode->clock = DIV_ROUND_UP(rate, 1000); return true; diff --git a/drivers/gpu/drm/selftests/test-drm_damage_helper.c b/drivers/gpu/drm/selftests/test-drm_damage_helper.c index 1c19a5d3eefb..8d8d8e214c28 100644 --- a/drivers/gpu/drm/selftests/test-drm_damage_helper.c +++ b/drivers/gpu/drm/selftests/test-drm_damage_helper.c @@ -30,6 +30,7 @@ static void mock_setup(struct drm_plane_state *state) mock_device.driver = &mock_driver; mock_device.mode_config.prop_fb_damage_clips = &mock_prop; mock_plane.dev = &mock_device; + mock_obj_props.count = 0; mock_plane.base.properties = &mock_obj_props; mock_prop.base.id = 1; /* 0 is an invalid id */ mock_prop.dev = &mock_device; diff --git a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c index f75fb157f2ff..016b877051da 100644 --- a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c +++ b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.c @@ -216,11 +216,13 @@ static int sun8i_dw_hdmi_bind(struct device *dev, struct device *master, goto err_disable_clk_tmds; } + ret = sun8i_hdmi_phy_init(hdmi->phy); + if (ret) + goto err_disable_clk_tmds; + drm_encoder_helper_add(encoder, &sun8i_dw_hdmi_encoder_helper_funcs); drm_simple_encoder_init(drm, encoder, DRM_MODE_ENCODER_TMDS); - sun8i_hdmi_phy_init(hdmi->phy); - plat_data->mode_valid = hdmi->quirks->mode_valid; plat_data->use_drm_infoframe = hdmi->quirks->use_drm_infoframe; sun8i_hdmi_phy_set_ops(hdmi->phy, plat_data); @@ -262,6 +264,7 @@ static void sun8i_dw_hdmi_unbind(struct device *dev, struct device *master, struct sun8i_dw_hdmi *hdmi = dev_get_drvdata(dev); dw_hdmi_unbind(hdmi->hdmi); + sun8i_hdmi_phy_deinit(hdmi->phy); clk_disable_unprepare(hdmi->clk_tmds); reset_control_assert(hdmi->rst_ctrl); gpiod_set_value(hdmi->ddc_en, 0); diff --git a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h index 74f6ed0e2570..bffe1b9cd3dc 100644 --- a/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h +++ b/drivers/gpu/drm/sun4i/sun8i_dw_hdmi.h @@ -169,6 +169,7 @@ struct sun8i_hdmi_phy { struct clk *clk_phy; struct clk *clk_pll0; struct clk *clk_pll1; + struct device *dev; unsigned int rcal; struct regmap *regs; struct reset_control *rst_phy; @@ -205,7 +206,8 @@ encoder_to_sun8i_dw_hdmi(struct drm_encoder *encoder) int sun8i_hdmi_phy_get(struct sun8i_dw_hdmi *hdmi, struct device_node *node); -void sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy); +int sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy); +void sun8i_hdmi_phy_deinit(struct sun8i_hdmi_phy *phy); void sun8i_hdmi_phy_set_ops(struct sun8i_hdmi_phy *phy, struct dw_hdmi_plat_data *plat_data); diff --git a/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c b/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c index c9239708d398..b64d93da651d 100644 --- a/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c +++ b/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c @@ -506,9 +506,60 @@ static void sun8i_hdmi_phy_init_h3(struct sun8i_hdmi_phy *phy) phy->rcal = (val & SUN8I_HDMI_PHY_ANA_STS_RCAL_MASK) >> 2; } -void sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy) +int 
sun8i_hdmi_phy_init(struct sun8i_hdmi_phy *phy) { + int ret; + + ret = reset_control_deassert(phy->rst_phy); + if (ret) { + dev_err(phy->dev, "Cannot deassert phy reset control: %d\n", ret); + return ret; + } + + ret = clk_prepare_enable(phy->clk_bus); + if (ret) { + dev_err(phy->dev, "Cannot enable bus clock: %d\n", ret); + goto err_assert_rst_phy; + } + + ret = clk_prepare_enable(phy->clk_mod); + if (ret) { + dev_err(phy->dev, "Cannot enable mod clock: %d\n", ret); + goto err_disable_clk_bus; + } + + if (phy->variant->has_phy_clk) { + ret = sun8i_phy_clk_create(phy, phy->dev, + phy->variant->has_second_pll); + if (ret) { + dev_err(phy->dev, "Couldn't create the PHY clock\n"); + goto err_disable_clk_mod; + } + + clk_prepare_enable(phy->clk_phy); + } + phy->variant->phy_init(phy); + + return 0; + +err_disable_clk_mod: + clk_disable_unprepare(phy->clk_mod); +err_disable_clk_bus: + clk_disable_unprepare(phy->clk_bus); +err_assert_rst_phy: + reset_control_assert(phy->rst_phy); + + return ret; +} + +void sun8i_hdmi_phy_deinit(struct sun8i_hdmi_phy *phy) +{ + clk_disable_unprepare(phy->clk_mod); + clk_disable_unprepare(phy->clk_bus); + clk_disable_unprepare(phy->clk_phy); + + reset_control_assert(phy->rst_phy); } void sun8i_hdmi_phy_set_ops(struct sun8i_hdmi_phy *phy, @@ -638,6 +689,7 @@ static int sun8i_hdmi_phy_probe(struct platform_device *pdev) return -ENOMEM; phy->variant = (struct sun8i_hdmi_phy_variant *)match->data; + phy->dev = dev; ret = of_address_to_resource(node, 0, &res); if (ret) { @@ -696,47 +748,10 @@ static int sun8i_hdmi_phy_probe(struct platform_device *pdev) goto err_put_clk_pll1; } - ret = reset_control_deassert(phy->rst_phy); - if (ret) { - dev_err(dev, "Cannot deassert phy reset control: %d\n", ret); - goto err_put_rst_phy; - } - - ret = clk_prepare_enable(phy->clk_bus); - if (ret) { - dev_err(dev, "Cannot enable bus clock: %d\n", ret); - goto err_deassert_rst_phy; - } - - ret = clk_prepare_enable(phy->clk_mod); - if (ret) { - dev_err(dev, "Cannot enable mod clock: %d\n", ret); - goto err_disable_clk_bus; - } - - if (phy->variant->has_phy_clk) { - ret = sun8i_phy_clk_create(phy, dev, - phy->variant->has_second_pll); - if (ret) { - dev_err(dev, "Couldn't create the PHY clock\n"); - goto err_disable_clk_mod; - } - - clk_prepare_enable(phy->clk_phy); - } - platform_set_drvdata(pdev, phy); return 0; -err_disable_clk_mod: - clk_disable_unprepare(phy->clk_mod); -err_disable_clk_bus: - clk_disable_unprepare(phy->clk_bus); -err_deassert_rst_phy: - reset_control_assert(phy->rst_phy); -err_put_rst_phy: - reset_control_put(phy->rst_phy); err_put_clk_pll1: clk_put(phy->clk_pll1); err_put_clk_pll0: @@ -753,12 +768,6 @@ static int sun8i_hdmi_phy_remove(struct platform_device *pdev) { struct sun8i_hdmi_phy *phy = platform_get_drvdata(pdev); - clk_disable_unprepare(phy->clk_mod); - clk_disable_unprepare(phy->clk_bus); - clk_disable_unprepare(phy->clk_phy); - - reset_control_assert(phy->rst_phy); - reset_control_put(phy->rst_phy); clk_put(phy->clk_pll0); diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 1c5ffe2935af..abf2d7a4fdf1 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -190,6 +190,7 @@ static void ttm_transfered_destroy(struct ttm_buffer_object *bo) struct ttm_transfer_obj *fbo; fbo = container_of(bo, struct ttm_transfer_obj, base); + dma_resv_fini(&fbo->base.base._resv); ttm_bo_put(fbo->bo); kfree(fbo); } diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 
b4b4653fe301..ed8a4b7f8b6e 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -1395,14 +1395,6 @@ static int vc4_hdmi_audio_prepare(struct device *dev, void *data, return 0; } -static const struct snd_soc_dapm_widget vc4_hdmi_audio_widgets[] = { - SND_SOC_DAPM_OUTPUT("TX"), -}; - -static const struct snd_soc_dapm_route vc4_hdmi_audio_routes[] = { - { "TX", NULL, "Playback" }, -}; - static const struct snd_soc_component_driver vc4_hdmi_audio_cpu_dai_comp = { .name = "vc4-hdmi-cpu-dai-component", }; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 42f3d9d123a1..d030577ad6a2 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -13,6 +13,7 @@ #define _HYPERV_VMBUS_H #include <linux/list.h> +#include <linux/bitops.h> #include <asm/sync_bitops.h> #include <asm/hyperv-tlfs.h> #include <linux/atomic.h> diff --git a/drivers/i2c/busses/i2c-mlxcpld.c b/drivers/i2c/busses/i2c-mlxcpld.c index 4e0b7c2882ce..015e11c4663f 100644 --- a/drivers/i2c/busses/i2c-mlxcpld.c +++ b/drivers/i2c/busses/i2c-mlxcpld.c @@ -49,7 +49,7 @@ #define MLXCPLD_LPCI2C_NACK_IND 2 #define MLXCPLD_I2C_FREQ_1000KHZ_SET 0x04 -#define MLXCPLD_I2C_FREQ_400KHZ_SET 0x0f +#define MLXCPLD_I2C_FREQ_400KHZ_SET 0x0c #define MLXCPLD_I2C_FREQ_100KHZ_SET 0x42 enum mlxcpld_i2c_frequency { @@ -495,7 +495,7 @@ mlxcpld_i2c_set_frequency(struct mlxcpld_i2c_priv *priv, return err; /* Set frequency only if it is not 100KHz, which is default. */ - switch ((data->reg & data->mask) >> data->bit) { + switch ((regval & data->mask) >> data->bit) { case MLXCPLD_I2C_FREQ_1000KHZ: freq = MLXCPLD_I2C_FREQ_1000KHZ_SET; break; diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index 477480d1de6b..7d4b3eb7077a 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -41,6 +41,8 @@ #define I2C_HANDSHAKE_RST 0x0020 #define I2C_FIFO_ADDR_CLR 0x0001 #define I2C_DELAY_LEN 0x0002 +#define I2C_ST_START_CON 0x8001 +#define I2C_FS_START_CON 0x1800 #define I2C_TIME_CLR_VALUE 0x0000 #define I2C_TIME_DEFAULT_VALUE 0x0003 #define I2C_WRRD_TRANAC_VALUE 0x0002 @@ -480,6 +482,7 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) { u16 control_reg; u16 intr_stat_reg; + u16 ext_conf_val; mtk_i2c_writew(i2c, I2C_CHN_CLR_FLAG, OFFSET_START); intr_stat_reg = mtk_i2c_readw(i2c, OFFSET_INTR_STAT); @@ -518,8 +521,13 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) if (i2c->dev_comp->ltiming_adjust) mtk_i2c_writew(i2c, i2c->ltiming_reg, OFFSET_LTIMING); + if (i2c->speed_hz <= I2C_MAX_STANDARD_MODE_FREQ) + ext_conf_val = I2C_ST_START_CON; + else + ext_conf_val = I2C_FS_START_CON; + if (i2c->dev_comp->timing_adjust) { - mtk_i2c_writew(i2c, i2c->ac_timing.ext, OFFSET_EXT_CONF); + ext_conf_val = i2c->ac_timing.ext; mtk_i2c_writew(i2c, i2c->ac_timing.inter_clk_div, OFFSET_CLOCK_DIV); mtk_i2c_writew(i2c, I2C_SCL_MIS_COMP_VALUE, @@ -544,6 +552,7 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) OFFSET_HS_STA_STO_AC_TIMING); } } + mtk_i2c_writew(i2c, ext_conf_val, OFFSET_EXT_CONF); /* If use i2c pin from PMIC mt6397 side, need set PATH_DIR first */ if (i2c->have_pmic) diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index aaeeacc12121..546cc935e035 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -454,6 +454,7 @@ static int i2c_acpi_notify(struct notifier_block *nb, unsigned long value, break; i2c_acpi_register_device(adapter, adev, &info); + put_device(&adapter->dev); break; case ACPI_RECONFIG_DEVICE_REMOVE: 
if (!acpi_device_enumerated(adev)) diff --git a/drivers/iio/accel/fxls8962af-core.c b/drivers/iio/accel/fxls8962af-core.c index 0019f1ea7df2..f41db9e0249a 100644 --- a/drivers/iio/accel/fxls8962af-core.c +++ b/drivers/iio/accel/fxls8962af-core.c @@ -738,7 +738,7 @@ static irqreturn_t fxls8962af_interrupt(int irq, void *p) if (reg & FXLS8962AF_INT_STATUS_SRC_BUF) { ret = fxls8962af_fifo_flush(indio_dev); - if (ret) + if (ret < 0) return IRQ_NONE; return IRQ_HANDLED; diff --git a/drivers/iio/adc/ad7192.c b/drivers/iio/adc/ad7192.c index ee8ed9481025..2121a812b0c3 100644 --- a/drivers/iio/adc/ad7192.c +++ b/drivers/iio/adc/ad7192.c @@ -293,6 +293,7 @@ static const struct ad_sigma_delta_info ad7192_sigma_delta_info = { .has_registers = true, .addr_shift = 3, .read_mask = BIT(6), + .irq_flags = IRQF_TRIGGER_FALLING, }; static const struct ad_sd_calib_data ad7192_calib_arr[8] = { diff --git a/drivers/iio/adc/ad7780.c b/drivers/iio/adc/ad7780.c index 42bb952f4738..b6e8c8abf6f4 100644 --- a/drivers/iio/adc/ad7780.c +++ b/drivers/iio/adc/ad7780.c @@ -203,7 +203,7 @@ static const struct ad_sigma_delta_info ad7780_sigma_delta_info = { .set_mode = ad7780_set_mode, .postprocess_sample = ad7780_postprocess_sample, .has_registers = false, - .irq_flags = IRQF_TRIGGER_LOW, + .irq_flags = IRQF_TRIGGER_FALLING, }; #define _AD7780_CHANNEL(_bits, _wordsize, _mask_all) \ diff --git a/drivers/iio/adc/ad7793.c b/drivers/iio/adc/ad7793.c index ef3e2d3ecb0c..0e7ab3fb072a 100644 --- a/drivers/iio/adc/ad7793.c +++ b/drivers/iio/adc/ad7793.c @@ -206,7 +206,7 @@ static const struct ad_sigma_delta_info ad7793_sigma_delta_info = { .has_registers = true, .addr_shift = 3, .read_mask = BIT(6), - .irq_flags = IRQF_TRIGGER_LOW, + .irq_flags = IRQF_TRIGGER_FALLING, }; static const struct ad_sd_calib_data ad7793_calib_arr[6] = { diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c index 19efaa41bc34..34ec0c28b2df 100644 --- a/drivers/iio/adc/aspeed_adc.c +++ b/drivers/iio/adc/aspeed_adc.c @@ -183,6 +183,7 @@ static int aspeed_adc_probe(struct platform_device *pdev) data = iio_priv(indio_dev); data->dev = &pdev->dev; + platform_set_drvdata(pdev, indio_dev); data->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(data->base)) diff --git a/drivers/iio/adc/max1027.c b/drivers/iio/adc/max1027.c index 655ab02d03d8..b753658bb41e 100644 --- a/drivers/iio/adc/max1027.c +++ b/drivers/iio/adc/max1027.c @@ -103,7 +103,7 @@ MODULE_DEVICE_TABLE(of, max1027_adc_dt_ids); .sign = 'u', \ .realbits = depth, \ .storagebits = 16, \ - .shift = 2, \ + .shift = (depth == 10) ? 
2 : 0, \ .endianness = IIO_BE, \ }, \ } @@ -142,7 +142,6 @@ MODULE_DEVICE_TABLE(of, max1027_adc_dt_ids); MAX1027_V_CHAN(11, depth) #define MAX1X31_CHANNELS(depth) \ - MAX1X27_CHANNELS(depth), \ MAX1X29_CHANNELS(depth), \ MAX1027_V_CHAN(12, depth), \ MAX1027_V_CHAN(13, depth), \ diff --git a/drivers/iio/adc/mt6577_auxadc.c b/drivers/iio/adc/mt6577_auxadc.c index 79c1dd68b909..d4fccd52ef08 100644 --- a/drivers/iio/adc/mt6577_auxadc.c +++ b/drivers/iio/adc/mt6577_auxadc.c @@ -82,6 +82,10 @@ static const struct iio_chan_spec mt6577_auxadc_iio_channels[] = { MT6577_AUXADC_CHANNEL(15), }; +/* For Voltage calculation */ +#define VOLTAGE_FULL_RANGE 1500 /* VA voltage */ +#define AUXADC_PRECISE 4096 /* 12 bits */ + static int mt_auxadc_get_cali_data(int rawdata, bool enable_cali) { return rawdata; @@ -191,6 +195,10 @@ static int mt6577_auxadc_read_raw(struct iio_dev *indio_dev, } if (adc_dev->dev_comp->sample_data_cali) *val = mt_auxadc_get_cali_data(*val, true); + + /* Convert adc raw data to voltage: 0 - 1500 mV */ + *val = *val * VOLTAGE_FULL_RANGE / AUXADC_PRECISE; + return IIO_VAL_INT; default: diff --git a/drivers/iio/adc/rzg2l_adc.c b/drivers/iio/adc/rzg2l_adc.c index 9996d5eef289..32fbf57c362f 100644 --- a/drivers/iio/adc/rzg2l_adc.c +++ b/drivers/iio/adc/rzg2l_adc.c @@ -401,7 +401,7 @@ static int rzg2l_adc_hw_init(struct rzg2l_adc *adc) exit_hw_init: clk_disable_unprepare(adc->pclk); - return 0; + return ret; } static void rzg2l_adc_pm_runtime_disable(void *data) @@ -570,8 +570,10 @@ static int __maybe_unused rzg2l_adc_pm_runtime_resume(struct device *dev) return ret; ret = clk_prepare_enable(adc->adclk); - if (ret) + if (ret) { + clk_disable_unprepare(adc->pclk); return ret; + } rzg2l_adc_pwr(adc, true); diff --git a/drivers/iio/adc/ti-adc128s052.c b/drivers/iio/adc/ti-adc128s052.c index 3143f35a6509..83c1ae07b3e9 100644 --- a/drivers/iio/adc/ti-adc128s052.c +++ b/drivers/iio/adc/ti-adc128s052.c @@ -171,7 +171,13 @@ static int adc128_probe(struct spi_device *spi) mutex_init(&adc->lock); ret = iio_device_register(indio_dev); + if (ret) + goto err_disable_regulator; + return 0; + +err_disable_regulator: + regulator_disable(adc->reg); return ret; } diff --git a/drivers/iio/common/ssp_sensors/ssp_spi.c b/drivers/iio/common/ssp_sensors/ssp_spi.c index 4864c38b8d1c..769bd9280524 100644 --- a/drivers/iio/common/ssp_sensors/ssp_spi.c +++ b/drivers/iio/common/ssp_sensors/ssp_spi.c @@ -137,7 +137,7 @@ static int ssp_print_mcu_debug(char *data_frame, int *data_index, if (length > received_len - *data_index || length <= 0) { ssp_dbg("[SSP]: MSG From MCU-invalid debug length(%d/%d)\n", length, received_len); - return length ? 
length : -EPROTO; + return -EPROTO; } ssp_dbg("[SSP]: MSG From MCU - %s\n", &data_frame[*data_index]); @@ -273,6 +273,8 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) for (idx = 0; idx < len;) { switch (dataframe[idx++]) { case SSP_MSG2AP_INST_BYPASS_DATA: + if (idx >= len) + return -EPROTO; sd = dataframe[idx++]; if (sd < 0 || sd >= SSP_SENSOR_MAX) { dev_err(SSP_DEV, @@ -282,10 +284,13 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) if (indio_devs[sd]) { spd = iio_priv(indio_devs[sd]); - if (spd->process_data) + if (spd->process_data) { + if (idx >= len) + return -EPROTO; spd->process_data(indio_devs[sd], &dataframe[idx], data->timestamp); + } } else { dev_err(SSP_DEV, "no client for frame\n"); } @@ -293,6 +298,8 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) idx += ssp_offset_map[sd]; break; case SSP_MSG2AP_INST_DEBUG_DATA: + if (idx >= len) + return -EPROTO; sd = ssp_print_mcu_debug(dataframe, &idx, len); if (sd) { dev_err(SSP_DEV, diff --git a/drivers/iio/dac/ti-dac5571.c b/drivers/iio/dac/ti-dac5571.c index 2a5ba1b08a1d..546a4cf6c5ef 100644 --- a/drivers/iio/dac/ti-dac5571.c +++ b/drivers/iio/dac/ti-dac5571.c @@ -350,6 +350,7 @@ static int dac5571_probe(struct i2c_client *client, data->dac5571_pwrdwn = dac5571_pwrdwn_quad; break; default: + ret = -EINVAL; goto err; } diff --git a/drivers/iio/imu/adis16475.c b/drivers/iio/imu/adis16475.c index eb48102f9424..287fff39a927 100644 --- a/drivers/iio/imu/adis16475.c +++ b/drivers/iio/imu/adis16475.c @@ -353,10 +353,11 @@ static int adis16475_set_freq(struct adis16475 *st, const u32 freq) if (dec > st->info->max_dec) dec = st->info->max_dec; - ret = adis_write_reg_16(&st->adis, ADIS16475_REG_DEC_RATE, dec); + ret = __adis_write_reg_16(&st->adis, ADIS16475_REG_DEC_RATE, dec); if (ret) goto error; + adis_dev_unlock(&st->adis); /* * If decimation is used, then gyro and accel data will have meaningful * bits on the LSB registers. This info is used on the trigger handler. 
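The ssp_spi.c hunks above guard every byte the dataframe parser consumes with an "idx >= len" check, so a truncated frame now fails with -EPROTO instead of reading past the received buffer. Below is a self-contained sketch of that defensive-parsing pattern; the type/payload frame layout is hypothetical and not the driver's real protocol.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* Parse a sequence of (type, payload) byte pairs, refusing to read
 * beyond the received length. */
static int parse_frame(const unsigned char *frame, size_t len)
{
	size_t idx = 0;

	while (idx < len) {
		unsigned char type = frame[idx++];

		/* The payload byte must also lie inside the frame. */
		if (idx >= len)
			return -EPROTO;

		printf("type %u payload %u\n", type, frame[idx++]);
	}

	return 0;
}

int main(void)
{
	unsigned char good[] = { 1, 10, 2, 20 };
	unsigned char truncated[] = { 1, 10, 2 };

	printf("good: %d\n", parse_frame(good, sizeof(good)));
	printf("truncated: %d\n", parse_frame(truncated, sizeof(truncated)));
	return 0;
}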
diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index a869a6e52a16..ed129321a14d 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -144,6 +144,7 @@ struct adis16480_chip_info { unsigned int max_dec_rate; const unsigned int *filter_freqs; bool has_pps_clk_mode; + bool has_sleep_cnt; const struct adis_data adis_data; }; @@ -939,6 +940,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16375, &adis16485_timeouts, 0), }, @@ -952,6 +954,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16480, &adis16480_timeouts, 0), }, @@ -965,6 +968,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16485, &adis16485_timeouts, 0), }, @@ -978,6 +982,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .temp_scale = 5650, /* 5.65 milli degree Celsius */ .int_clk = 2460000, .max_dec_rate = 2048, + .has_sleep_cnt = true, .filter_freqs = adis16480_def_filter_freqs, .adis_data = ADIS16480_DATA(16488, &adis16485_timeouts, 0), }, @@ -1425,9 +1430,12 @@ static int adis16480_probe(struct spi_device *spi) if (ret) return ret; - ret = devm_add_action_or_reset(&spi->dev, adis16480_stop, indio_dev); - if (ret) - return ret; + if (st->chip_info->has_sleep_cnt) { + ret = devm_add_action_or_reset(&spi->dev, adis16480_stop, + indio_dev); + if (ret) + return ret; + } ret = adis16480_config_irq_pin(spi->dev.of_node, st); if (ret) diff --git a/drivers/iio/light/opt3001.c b/drivers/iio/light/opt3001.c index 52963da401a7..1880bd5bb258 100644 --- a/drivers/iio/light/opt3001.c +++ b/drivers/iio/light/opt3001.c @@ -276,6 +276,8 @@ static int opt3001_get_lux(struct opt3001 *opt, int *val, int *val2) ret = wait_event_timeout(opt->result_ready_queue, opt->result_ready, msecs_to_jiffies(OPT3001_RESULT_READY_LONG)); + if (ret == 0) + return -ETIMEDOUT; } else { /* Sleep for result ready time */ timeout = (opt->int_time == OPT3001_INT_TIME_SHORT) ? 
@@ -312,9 +314,7 @@ err: /* Disallow IRQ to access the device while lock is active */ opt->ok_to_ignore_lock = false; - if (ret == 0) - return -ETIMEDOUT; - else if (ret < 0) + if (ret < 0) return ret; if (opt->use_irq) { diff --git a/drivers/iio/test/Makefile b/drivers/iio/test/Makefile index f1099b495301..467519a2027e 100644 --- a/drivers/iio/test/Makefile +++ b/drivers/iio/test/Makefile @@ -5,3 +5,4 @@ # Keep in alphabetical order obj-$(CONFIG_IIO_TEST_FORMAT) += iio-test-format.o +CFLAGS_iio-test-format.o += $(DISABLE_STRUCTLEAK_PLUGIN) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a20b8108e160..c00f8e28aab7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -706,8 +706,9 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, dev_name(&query->port->agent->device->dev), - LS_DEVICE_NAME_MAX); + strscpy_pad(header->device_name, + dev_name(&query->port->agent->device->dev), + LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 489b436f19bb..3d42bd2b36bd 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -878,6 +878,7 @@ void sc_disable(struct send_context *sc) { u64 reg; struct pio_buf *pbuf; + LIST_HEAD(wake_list); if (!sc) return; @@ -912,19 +913,21 @@ void sc_disable(struct send_context *sc) spin_unlock(&sc->release_lock); write_seqlock(&sc->waitlock); - while (!list_empty(&sc->piowait)) { + if (!list_empty(&sc->piowait)) + list_move(&sc->piowait, &wake_list); + write_sequnlock(&sc->waitlock); + while (!list_empty(&wake_list)) { struct iowait *wait; struct rvt_qp *qp; struct hfi1_qp_priv *priv; - wait = list_first_entry(&sc->piowait, struct iowait, list); + wait = list_first_entry(&wake_list, struct iowait, list); qp = iowait_to_qp(wait); priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } - write_sequnlock(&sc->waitlock); spin_unlock_irq(&sc->alloc_lock); } diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 5fb92de1f015..9b544a3b1288 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -1092,12 +1092,12 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) if (cq->avoid_mem_cflct) { ext_cqe = (__le64 *)((u8 *)cqe + 32); get_64bit_val(ext_cqe, 24, &qword7); - polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); + polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword7); } else { peek_head = (cq->cq_ring.head + 1) % cq->cq_ring.size; ext_cqe = cq->cq_base[peek_head].buf; get_64bit_val(ext_cqe, 24, &qword7); - polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); + polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword7); if (!peek_head) polarity ^= 1; } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 7110ebf834f9..102dc9342f2a 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3399,9 +3399,13 @@ static void irdma_process_cqe(struct ib_wc *entry, } if (cq_poll_info->ud_vlan_valid) { - entry->vlan_id = cq_poll_info->ud_vlan & VLAN_VID_MASK; - entry->wc_flags |= IB_WC_WITH_VLAN; + u16 vlan = cq_poll_info->ud_vlan & VLAN_VID_MASK; + entry->sl = 
cq_poll_info->ud_vlan >> VLAN_PRIO_SHIFT; + if (vlan) { + entry->vlan_id = vlan; + entry->wc_flags |= IB_WC_WITH_VLAN; + } } else { entry->sl = 0; } diff --git a/drivers/infiniband/hw/irdma/ws.c b/drivers/infiniband/hw/irdma/ws.c index b68c575eb78e..b0d6ee0739f5 100644 --- a/drivers/infiniband/hw/irdma/ws.c +++ b/drivers/infiniband/hw/irdma/ws.c @@ -330,8 +330,10 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) tc_node->enable = true; ret = irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_MODIFY_NODE); - if (ret) + if (ret) { + vsi->unregister_qset(vsi, tc_node); goto reg_err; + } } ibdev_dbg(to_ibdev(vsi->dev), "WS: Using node %d which represents VSI %d TC %d\n", @@ -350,6 +352,10 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) } goto exit; +reg_err: + irdma_ws_cqp_cmd(vsi, tc_node, IRDMA_OP_WS_DELETE_NODE); + list_del(&tc_node->siblings); + irdma_free_node(vsi, tc_node); leaf_add_err: if (list_empty(&vsi_node->child_list_head)) { if (irdma_ws_cqp_cmd(vsi, vsi_node, IRDMA_OP_WS_DELETE_NODE)) @@ -369,11 +375,6 @@ vsi_add_err: exit: mutex_unlock(&vsi->dev->ws_mutex); return ret; - -reg_err: - mutex_unlock(&vsi->dev->ws_mutex); - irdma_ws_remove(vsi, user_pri); - return ret; } /** diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 3be36ebbf67a..22e2f4d79743 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1339,7 +1339,6 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, goto err_2; } mr->mmkey.type = MLX5_MKEY_MR; - mr->desc_size = sizeof(struct mlx5_mtt); mr->umem = umem; set_mr_fields(dev, mr, umem->length, access_flags); kvfree(in); @@ -1533,6 +1532,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, ib_umem_release(&odp->umem); return ERR_CAST(mr); } + xa_init(&mr->implicit_children); odp->private = mr; err = mlx5r_store_odp_mkey(dev, &mr->mmkey); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index b2fca110346c..e5abbcfc1d57 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4458,6 +4458,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, mtu, attr->path_mtu); MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 3cb4febaad0f..8def88cfa300 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -455,6 +455,7 @@ struct qedr_qp { /* synchronization objects used with iwarp ep */ struct kref refcnt; struct completion iwarp_cm_comp; + struct completion qp_rel_comp; unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */ }; diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c index 1715fbe0719d..a51fc6854984 100644 --- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -83,7 +83,7 @@ static void qedr_iw_free_qp(struct kref *ref) { struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt); - kfree(qp); + complete(&qp->qp_rel_comp); } static void diff --git a/drivers/infiniband/hw/qedr/verbs.c 
b/drivers/infiniband/hw/qedr/verbs.c index 3fbf172dbbef..dcb3653db72d 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1357,6 +1357,7 @@ static void qedr_set_common_qp_params(struct qedr_dev *dev, if (rdma_protocol_iwarp(&dev->ibdev, 1)) { kref_init(&qp->refcnt); init_completion(&qp->iwarp_cm_comp); + init_completion(&qp->qp_rel_comp); } qp->pd = pd; @@ -2857,8 +2858,10 @@ int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) qedr_free_qp_resources(dev, qp, udata); - if (rdma_protocol_iwarp(&dev->ibdev, 1)) + if (rdma_protocol_iwarp(&dev->ibdev, 1)) { qedr_iw_qp_rem_ref(&qp->ibqp); + wait_for_completion(&qp->qp_rel_comp); + } return 0; } diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index a67599b5a550..ac11943a5ddb 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -602,7 +602,7 @@ done: /* * How many pages in this iovec element? */ -static int qib_user_sdma_num_pages(const struct iovec *iov) +static size_t qib_user_sdma_num_pages(const struct iovec *iov) { const unsigned long addr = (unsigned long) iov->iov_base; const unsigned long len = iov->iov_len; @@ -658,7 +658,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, struct qib_user_sdma_queue *pq, struct qib_user_sdma_pkt *pkt, - unsigned long addr, int tlen, int npages) + unsigned long addr, int tlen, size_t npages) { struct page *pages[8]; int i, j; @@ -722,7 +722,7 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, unsigned long idx; for (idx = 0; idx < niov; idx++) { - const int npages = qib_user_sdma_num_pages(iov + idx); + const size_t npages = qib_user_sdma_num_pages(iov + idx); const unsigned long addr = (unsigned long) iov[idx].iov_base; ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr, @@ -824,8 +824,8 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, unsigned pktnw; unsigned pktnwc; int nfrags = 0; - int npages = 0; - int bytes_togo = 0; + size_t npages = 0; + size_t bytes_togo = 0; int tiddma = 0; int cfur; @@ -885,7 +885,11 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, npages += qib_user_sdma_num_pages(&iov[idx]); - bytes_togo += slen; + if (check_add_overflow(bytes_togo, slen, &bytes_togo) || + bytes_togo > type_max(typeof(pkt->bytes_togo))) { + ret = -EINVAL; + goto free_pbc; + } pktnwc += slen >> 2; idx++; nfrags++; @@ -904,8 +908,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, } if (frag_size) { - int tidsmsize, n; - size_t pktsize; + size_t tidsmsize, n, pktsize, sz, addrlimit; n = npages*((2*PAGE_SIZE/frag_size)+1); pktsize = struct_size(pkt, addr, n); @@ -923,14 +926,24 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, else tidsmsize = 0; - pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL); + if (check_add_overflow(pktsize, tidsmsize, &sz)) { + ret = -EINVAL; + goto free_pbc; + } + pkt = kmalloc(sz, GFP_KERNEL); if (!pkt) { ret = -ENOMEM; goto free_pbc; } pkt->largepkt = 1; pkt->frag_size = frag_size; - pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); + if (check_add_overflow(n, ARRAY_SIZE(pkt->addr), + &addrlimit) || + addrlimit > type_max(typeof(pkt->addrlimit))) { + ret = -EINVAL; + goto free_pbc; + } + pkt->addrlimit = addrlimit; if (tiddma) { char *tidsm = (char *)pkt + pktsize; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 
49bdd78ac664..3305f2744bfa 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1223,7 +1223,7 @@ int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, spin_lock(&rdi->n_qps_lock); if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) { spin_unlock(&rdi->n_qps_lock); - ret = ENOMEM; + ret = -ENOMEM; goto bail_ip; } diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 29de8412e416..4c914f75a902 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -334,6 +334,7 @@ static const struct xpad_device { { 0x24c6, 0x5b03, "Thrustmaster Ferrari 458 Racing Wheel", 0, XTYPE_XBOX360 }, { 0x24c6, 0x5d04, "Razer Sabertooth", 0, XTYPE_XBOX360 }, { 0x24c6, 0xfafe, "Rock Candy Gamepad for Xbox 360", 0, XTYPE_XBOX360 }, + { 0x3285, 0x0607, "Nacon GC-100", 0, XTYPE_XBOX360 }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } @@ -451,6 +452,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOXONE_VENDOR(0x24c6), /* PowerA Controllers */ XPAD_XBOXONE_VENDOR(0x2e24), /* Hyperkin Duke X-Box One pad */ XPAD_XBOX360_VENDOR(0x2f24), /* GameSir Controllers */ + XPAD_XBOX360_VENDOR(0x3285), /* Nacon GC-100 */ { } }; diff --git a/drivers/input/keyboard/snvs_pwrkey.c b/drivers/input/keyboard/snvs_pwrkey.c index 2f5e3ab5ed63..65286762b02a 100644 --- a/drivers/input/keyboard/snvs_pwrkey.c +++ b/drivers/input/keyboard/snvs_pwrkey.c @@ -3,6 +3,7 @@ // Driver for the IMX SNVS ON/OFF Power Key // Copyright (C) 2015 Freescale Semiconductor, Inc. All Rights Reserved. +#include <linux/clk.h> #include <linux/device.h> #include <linux/err.h> #include <linux/init.h> @@ -99,6 +100,11 @@ static irqreturn_t imx_snvs_pwrkey_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static void imx_snvs_pwrkey_disable_clk(void *data) +{ + clk_disable_unprepare(data); +} + static void imx_snvs_pwrkey_act(void *pdata) { struct pwrkey_drv_data *pd = pdata; @@ -111,6 +117,7 @@ static int imx_snvs_pwrkey_probe(struct platform_device *pdev) struct pwrkey_drv_data *pdata; struct input_dev *input; struct device_node *np; + struct clk *clk; int error; u32 vid; @@ -134,6 +141,28 @@ static int imx_snvs_pwrkey_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "KEY_POWER without setting in dts\n"); } + clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(clk)) { + dev_err(&pdev->dev, "Failed to get snvs clock (%pe)\n", clk); + return PTR_ERR(clk); + } + + error = clk_prepare_enable(clk); + if (error) { + dev_err(&pdev->dev, "Failed to enable snvs clock (%pe)\n", + ERR_PTR(error)); + return error; + } + + error = devm_add_action_or_reset(&pdev->dev, + imx_snvs_pwrkey_disable_clk, clk); + if (error) { + dev_err(&pdev->dev, + "Failed to register clock cleanup handler (%pe)\n", + ERR_PTR(error)); + return error; + } + pdata->wakeup = of_property_read_bool(np, "wakeup-source"); pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/input/touchscreen.c b/drivers/input/touchscreen.c index dd18cb917c4d..4620e20d0190 100644 --- a/drivers/input/touchscreen.c +++ b/drivers/input/touchscreen.c @@ -80,27 +80,27 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch, data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-x", input_abs_get_min(input, axis_x), - &minimum) | - touchscreen_get_prop_u32(dev, "touchscreen-size-x", - 
input_abs_get_max(input, - axis_x) + 1, - &maximum) | - touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x", - input_abs_get_fuzz(input, axis_x), - &fuzz); + &minimum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-size-x", + input_abs_get_max(input, + axis_x) + 1, + &maximum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x", + input_abs_get_fuzz(input, axis_x), + &fuzz); if (data_present) touchscreen_set_params(input, axis_x, minimum, maximum - 1, fuzz); data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-y", input_abs_get_min(input, axis_y), - &minimum) | - touchscreen_get_prop_u32(dev, "touchscreen-size-y", - input_abs_get_max(input, - axis_y) + 1, - &maximum) | - touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y", - input_abs_get_fuzz(input, axis_y), - &fuzz); + &minimum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-size-y", + input_abs_get_max(input, + axis_y) + 1, + &maximum); + data_present |= touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y", + input_abs_get_fuzz(input, axis_y), + &fuzz); if (data_present) touchscreen_set_params(input, axis_y, minimum, maximum - 1, fuzz); @@ -108,11 +108,11 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch, data_present = touchscreen_get_prop_u32(dev, "touchscreen-max-pressure", input_abs_get_max(input, axis), - &maximum) | - touchscreen_get_prop_u32(dev, - "touchscreen-fuzz-pressure", - input_abs_get_fuzz(input, axis), - &fuzz); + &maximum); + data_present |= touchscreen_get_prop_u32(dev, + "touchscreen-fuzz-pressure", + input_abs_get_fuzz(input, axis), + &fuzz); if (data_present) touchscreen_set_params(input, axis, 0, maximum, fuzz); diff --git a/drivers/input/touchscreen/resistive-adc-touch.c b/drivers/input/touchscreen/resistive-adc-touch.c index 744544a723b7..6f754a8d30b1 100644 --- a/drivers/input/touchscreen/resistive-adc-touch.c +++ b/drivers/input/touchscreen/resistive-adc-touch.c @@ -71,19 +71,22 @@ static int grts_cb(const void *data, void *private) unsigned int z2 = touch_info[st->ch_map[GRTS_CH_Z2]]; unsigned int Rt; - Rt = z2; - Rt -= z1; - Rt *= st->x_plate_ohms; - Rt = DIV_ROUND_CLOSEST(Rt, 16); - Rt *= x; - Rt /= z1; - Rt = DIV_ROUND_CLOSEST(Rt, 256); - /* - * On increased pressure the resistance (Rt) is decreasing - * so, convert values to make it looks as real pressure. - */ - if (Rt < GRTS_DEFAULT_PRESSURE_MAX) - press = GRTS_DEFAULT_PRESSURE_MAX - Rt; + if (likely(x && z1)) { + Rt = z2; + Rt -= z1; + Rt *= st->x_plate_ohms; + Rt = DIV_ROUND_CLOSEST(Rt, 16); + Rt *= x; + Rt /= z1; + Rt = DIV_ROUND_CLOSEST(Rt, 256); + /* + * On increased pressure the resistance (Rt) is + * decreasing so, convert values to make it looks as + * real pressure. + */ + if (Rt < GRTS_DEFAULT_PRESSURE_MAX) + press = GRTS_DEFAULT_PRESSURE_MAX - Rt; + } } if ((!x && !y) || (st->pressure && (press < st->pressure_min))) { diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 124c41adeca1..3eb68fa1b8cc 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -308,7 +308,6 @@ config APPLE_DART config ARM_SMMU tristate "ARM Ltd. System MMU (SMMU) Support" depends on ARM64 || ARM || (COMPILE_TEST && !GENERIC_ATOMIC64) - depends on QCOM_SCM || !QCOM_SCM #if QCOM_SCM=m this can't be =y select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU if ARM @@ -356,6 +355,14 @@ config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT 'arm-smmu.disable_bypass' will continue to override this config. 
+config ARM_SMMU_QCOM + def_tristate y + depends on ARM_SMMU && ARCH_QCOM + select QCOM_SCM + help + When running on a Qualcomm platform that has the custom variant + of the ARM SMMU, this needs to be built into the SMMU driver. + config ARM_SMMU_V3 tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" depends on ARM64 @@ -438,7 +445,7 @@ config QCOM_IOMMU # Note: iommu drivers cannot (yet?) be built as modules bool "Qualcomm IOMMU Support" depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64) - depends on QCOM_SCM=y + select QCOM_SCM select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU diff --git a/drivers/iommu/arm/arm-smmu/Makefile b/drivers/iommu/arm/arm-smmu/Makefile index e240a7bcf310..b0cc01aa20c9 100644 --- a/drivers/iommu/arm/arm-smmu/Makefile +++ b/drivers/iommu/arm/arm-smmu/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_ARM_SMMU) += arm_smmu.o -arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o arm-smmu-qcom.o +arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o +arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 9f465e146799..2c25cce38060 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -215,7 +215,8 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) of_device_is_compatible(np, "nvidia,tegra186-smmu")) return nvidia_smmu_impl_init(smmu); - smmu = qcom_smmu_impl_init(smmu); + if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM)) + smmu = qcom_smmu_impl_init(smmu); if (of_device_is_compatible(np, "marvell,ap806-smmu-500")) smmu->impl = &mrvl_mmu500_impl; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index aca7b595c4c7..7038957f4a77 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -115,18 +115,24 @@ config BCM6345_L1_IRQ select GENERIC_IRQ_EFFECTIVE_AFF_MASK config BCM7038_L1_IRQ - bool + tristate "Broadcom STB 7038-style L1/L2 interrupt controller driver" + depends on ARCH_BRCMSTB || BMIPS_GENERIC + default ARCH_BRCMSTB || BMIPS_GENERIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN select GENERIC_IRQ_EFFECTIVE_AFF_MASK config BCM7120_L2_IRQ - bool + tristate "Broadcom STB 7120-style L2 interrupt controller driver" + depends on ARCH_BRCMSTB || BMIPS_GENERIC + default ARCH_BRCMSTB || BMIPS_GENERIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN config BRCMSTB_L2_IRQ - bool + tristate "Broadcom STB generic L2 interrupt controller driver" + depends on ARCH_BCM2835 || ARCH_BRCMSTB || BMIPS_GENERIC + default ARCH_BCM2835 || ARCH_BRCMSTB || BMIPS_GENERIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN @@ -400,8 +406,9 @@ config IRQ_UNIPHIER_AIDET Support for the UniPhier AIDET (ARM Interrupt Detector). config MESON_IRQ_GPIO - bool "Meson GPIO Interrupt Multiplexer" - depends on ARCH_MESON + tristate "Meson GPIO Interrupt Multiplexer" + depends on ARCH_MESON || COMPILE_TEST + default ARCH_MESON select IRQ_DOMAIN_HIERARCHY help Support Meson SoC Family GPIO Interrupt Multiplexer @@ -602,4 +609,12 @@ config APPLE_AIC Support for the Apple Interrupt Controller found on Apple Silicon SoCs, such as the M1. +config MCHP_EIC + bool "Microchip External Interrupt Controller" + depends on ARCH_AT91 || COMPILE_TEST + select IRQ_DOMAIN + select IRQ_DOMAIN_HIERARCHY + help + Support for Microchip External Interrupt Controller. 
+ endmenu diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index f88cbf36a9d2..c1f611cbfbf8 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -116,3 +116,4 @@ obj-$(CONFIG_MACH_REALTEK_RTL) += irq-realtek-rtl.o obj-$(CONFIG_WPCM450_AIC) += irq-wpcm450-aic.o obj-$(CONFIG_IRQ_IDT3243X) += irq-idt3243x.o obj-$(CONFIG_APPLE_AIC) += irq-apple-aic.o +obj-$(CONFIG_MCHP_EIC) += irq-mchp-eic.o diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 6fc145aacaf0..3759dc36cc8f 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -245,7 +245,7 @@ static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs) irq = FIELD_GET(AIC_EVENT_NUM, event); if (type == AIC_EVENT_TYPE_HW) - handle_domain_irq(aic_irqc->hw_domain, irq, regs); + generic_handle_domain_irq(aic_irqc->hw_domain, irq); else if (type == AIC_EVENT_TYPE_IPI && irq == 1) aic_handle_ipi(regs); else if (event != 0) @@ -392,25 +392,25 @@ static void __exception_irq_entry aic_handle_fiq(struct pt_regs *regs) } if (TIMER_FIRING(read_sysreg(cntp_ctl_el0))) - handle_domain_irq(aic_irqc->hw_domain, - aic_irqc->nr_hw + AIC_TMR_EL0_PHYS, regs); + generic_handle_domain_irq(aic_irqc->hw_domain, + aic_irqc->nr_hw + AIC_TMR_EL0_PHYS); if (TIMER_FIRING(read_sysreg(cntv_ctl_el0))) - handle_domain_irq(aic_irqc->hw_domain, - aic_irqc->nr_hw + AIC_TMR_EL0_VIRT, regs); + generic_handle_domain_irq(aic_irqc->hw_domain, + aic_irqc->nr_hw + AIC_TMR_EL0_VIRT); if (is_kernel_in_hyp_mode()) { uint64_t enabled = read_sysreg_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2); if ((enabled & VM_TMR_FIQ_ENABLE_P) && TIMER_FIRING(read_sysreg_s(SYS_CNTP_CTL_EL02))) - handle_domain_irq(aic_irqc->hw_domain, - aic_irqc->nr_hw + AIC_TMR_EL02_PHYS, regs); + generic_handle_domain_irq(aic_irqc->hw_domain, + aic_irqc->nr_hw + AIC_TMR_EL02_PHYS); if ((enabled & VM_TMR_FIQ_ENABLE_V) && TIMER_FIRING(read_sysreg_s(SYS_CNTV_CTL_EL02))) - handle_domain_irq(aic_irqc->hw_domain, - aic_irqc->nr_hw + AIC_TMR_EL02_VIRT, regs); + generic_handle_domain_irq(aic_irqc->hw_domain, + aic_irqc->nr_hw + AIC_TMR_EL02_VIRT); } if ((read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT)) == @@ -674,7 +674,7 @@ static void aic_handle_ipi(struct pt_regs *regs) firing = atomic_fetch_andnot(enabled, this_cpu_ptr(&aic_vipi_flag)) & enabled; for_each_set_bit(i, &firing, AIC_NR_SWIPI) - handle_domain_irq(aic_irqc->ipi_domain, i, regs); + generic_handle_domain_irq(aic_irqc->ipi_domain, i); /* * No ordering needed here; at worst this just changes the timing of diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index 53e0fb0562c1..80906bfec845 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -589,12 +589,7 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained) irq = msinr - PCI_MSI_DOORBELL_START; - if (is_chained) - generic_handle_domain_irq(armada_370_xp_msi_inner_domain, - irq); - else - handle_domain_irq(armada_370_xp_msi_inner_domain, - irq, regs); + generic_handle_domain_irq(armada_370_xp_msi_inner_domain, irq); } } #else @@ -646,8 +641,8 @@ armada_370_xp_handle_irq(struct pt_regs *regs) break; if (irqnr > 1) { - handle_domain_irq(armada_370_xp_mpic_domain, - irqnr, regs); + generic_handle_domain_irq(armada_370_xp_mpic_domain, + irqnr); continue; } @@ -666,7 +661,7 @@ armada_370_xp_handle_irq(struct pt_regs *regs) & IPI_DOORBELL_MASK; for_each_set_bit(ipi, &ipimask, IPI_DOORBELL_END) - 
handle_domain_irq(ipi_domain, ipi, regs); + generic_handle_domain_irq(ipi_domain, ipi); } #endif diff --git a/drivers/irqchip/irq-aspeed-vic.c b/drivers/irqchip/irq-aspeed-vic.c index 58717cd44f99..62ccf2c0c414 100644 --- a/drivers/irqchip/irq-aspeed-vic.c +++ b/drivers/irqchip/irq-aspeed-vic.c @@ -100,7 +100,7 @@ static void __exception_irq_entry avic_handle_irq(struct pt_regs *regs) if (stat == 0) break; irq += ffs(stat) - 1; - handle_domain_irq(vic->dom, irq, regs); + generic_handle_domain_irq(vic->dom, irq); } } diff --git a/drivers/irqchip/irq-ativic32.c b/drivers/irqchip/irq-ativic32.c index 476d6024aaf2..223dd2f97d28 100644 --- a/drivers/irqchip/irq-ativic32.c +++ b/drivers/irqchip/irq-ativic32.c @@ -5,11 +5,14 @@ #include <linux/of.h> #include <linux/of_irq.h> #include <linux/of_address.h> +#include <linux/hardirq.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/irqchip.h> #include <nds32_intrinsic.h> +#include <asm/irq_regs.h> + unsigned long wake_mask; static void ativic32_ack_irq(struct irq_data *data) @@ -103,10 +106,25 @@ static irq_hw_number_t get_intr_src(void) - NDS32_VECTOR_offINTERRUPT; } -asmlinkage void asm_do_IRQ(struct pt_regs *regs) +static void ativic32_handle_irq(struct pt_regs *regs) { irq_hw_number_t hwirq = get_intr_src(); - handle_domain_irq(root_domain, hwirq, regs); + generic_handle_domain_irq(root_domain, hwirq); +} + +/* + * TODO: convert nds32 to GENERIC_IRQ_MULTI_HANDLER so that this entry logic + * can live in arch code. + */ +asmlinkage void asm_do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + + irq_enter(); + old_regs = set_irq_regs(regs); + ativic32_handle_irq(regs); + set_irq_regs(old_regs); + irq_exit(); } int __init ativic32_init_irq(struct device_node *node, struct device_node *parent) diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 2c999dc310c1..4631f6847953 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -71,7 +71,7 @@ aic_handle(struct pt_regs *regs) if (!irqstat) irq_reg_writel(gc, 0, AT91_AIC_EOICR); else - handle_domain_irq(aic_domain, irqnr, regs); + generic_handle_domain_irq(aic_domain, irqnr); } static int aic_retrigger(struct irq_data *d) diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index fb4ad2aaa727..145535bd7560 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -80,7 +80,7 @@ aic5_handle(struct pt_regs *regs) if (!irqstat) irq_reg_writel(bgc, 0, AT91_AIC5_EOICR); else - handle_domain_irq(aic5_domain, irqnr, regs); + generic_handle_domain_irq(aic5_domain, irqnr); } static void aic5_mask(struct irq_data *d) diff --git a/drivers/irqchip/irq-bcm2835.c b/drivers/irqchip/irq-bcm2835.c index adc1556ed332..e94e2882286c 100644 --- a/drivers/irqchip/irq-bcm2835.c +++ b/drivers/irqchip/irq-bcm2835.c @@ -246,7 +246,7 @@ static void __exception_irq_entry bcm2835_handle_irq( u32 hwirq; while ((hwirq = get_next_armctrl_hwirq()) != ~0) - handle_domain_irq(intc.domain, hwirq, regs); + generic_handle_domain_irq(intc.domain, hwirq); } static void bcm2836_chained_handle_irq(struct irq_desc *desc) diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 501facdb4570..51491c3c6fdd 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -143,7 +143,7 @@ __exception_irq_entry bcm2836_arm_irqchip_handle_irq(struct pt_regs *regs) if (stat) { u32 hwirq = ffs(stat) - 1; - handle_domain_irq(intc.domain, hwirq, regs); + 
generic_handle_domain_irq(intc.domain, hwirq); } } diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c index e3483789f4df..fd079215c17f 100644 --- a/drivers/irqchip/irq-bcm6345-l1.c +++ b/drivers/irqchip/irq-bcm6345-l1.c @@ -132,16 +132,12 @@ static void bcm6345_l1_irq_handle(struct irq_desc *desc) int base = idx * IRQS_PER_WORD; unsigned long pending; irq_hw_number_t hwirq; - unsigned int irq; pending = __raw_readl(cpu->map_base + reg_status(intc, idx)); pending &= __raw_readl(cpu->map_base + reg_enable(intc, idx)); for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) { - irq = irq_linear_revmap(intc->domain, base + hwirq); - if (irq) - do_IRQ(irq); - else + if (generic_handle_domain_irq(intc->domain, base + hwirq)) spurious_interrupt(); } } diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index a035c385ca7a..a62b96237b82 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -28,9 +28,6 @@ #include <linux/irqchip.h> #include <linux/irqchip/chained_irq.h> #include <linux/syscore_ops.h> -#ifdef CONFIG_ARM -#include <asm/smp_plat.h> -#endif #define IRQS_PER_WORD 32 #define REG_BYTES_PER_IRQ_WORD (sizeof(u32) * 4) @@ -127,7 +124,7 @@ static void bcm7038_l1_irq_handle(struct irq_desc *desc) struct irq_chip *chip = irq_desc_get_chip(desc); unsigned int idx; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_MIPS) cpu = intc->cpus[cpu_logical_map(smp_processor_id())]; #else cpu = intc->cpus[0]; @@ -194,6 +191,7 @@ static void bcm7038_l1_mask(struct irq_data *d) raw_spin_unlock_irqrestore(&intc->lock, flags); } +#if defined(CONFIG_MIPS) && defined(CONFIG_SMP) static int bcm7038_l1_set_affinity(struct irq_data *d, const struct cpumask *dest, bool force) @@ -220,32 +218,6 @@ static int bcm7038_l1_set_affinity(struct irq_data *d, return 0; } - -#ifdef CONFIG_SMP -static void bcm7038_l1_cpu_offline(struct irq_data *d) -{ - struct cpumask *mask = irq_data_get_affinity_mask(d); - int cpu = smp_processor_id(); - cpumask_t new_affinity; - - /* This CPU was not on the affinity mask */ - if (!cpumask_test_cpu(cpu, mask)) - return; - - if (cpumask_weight(mask) > 1) { - /* - * Multiple CPU affinity, remove this CPU from the affinity - * mask - */ - cpumask_copy(&new_affinity, mask); - cpumask_clear_cpu(cpu, &new_affinity); - } else { - /* Only CPU, put on the lowest online CPU */ - cpumask_clear(&new_affinity); - cpumask_set_cpu(cpumask_first(cpu_online_mask), &new_affinity); - } - irq_set_affinity_locked(d, &new_affinity, false); -} #endif static int __init bcm7038_l1_init_one(struct device_node *dn, @@ -328,7 +300,7 @@ static int bcm7038_l1_suspend(void) u32 val; /* Wakeup interrupt should only come from the boot cpu */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_MIPS) boot_cpu = cpu_logical_map(0); #else boot_cpu = 0; @@ -352,7 +324,7 @@ static void bcm7038_l1_resume(void) struct bcm7038_l1_chip *intc; int boot_cpu, word; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_MIPS) boot_cpu = cpu_logical_map(0); #else boot_cpu = 0; @@ -395,9 +367,8 @@ static struct irq_chip bcm7038_l1_irq_chip = { .name = "bcm7038-l1", .irq_mask = bcm7038_l1_mask, .irq_unmask = bcm7038_l1_unmask, +#if defined(CONFIG_SMP) && defined(CONFIG_MIPS) .irq_set_affinity = bcm7038_l1_set_affinity, -#ifdef CONFIG_SMP - .irq_cpu_offline = bcm7038_l1_cpu_offline, #endif #ifdef CONFIG_PM_SLEEP .irq_set_wake = bcm7038_l1_set_wake, @@ -416,7 +387,7 @@ static int bcm7038_l1_map(struct irq_domain *d, unsigned int 
virq, irq_set_chip_and_handler(virq, &bcm7038_l1_irq_chip, handle_level_irq); irq_set_chip_data(virq, d->host_data); - irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq))); + irqd_set_single_target(irq_get_irq_data(virq)); return 0; } @@ -484,4 +455,8 @@ out_free: return ret; } -IRQCHIP_DECLARE(bcm7038_l1, "brcm,bcm7038-l1-intc", bcm7038_l1_of_init); +IRQCHIP_PLATFORM_DRIVER_BEGIN(bcm7038_l1) +IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_of_init) +IRQCHIP_PLATFORM_DRIVER_END(bcm7038_l1) +MODULE_DESCRIPTION("Broadcom STB 7038-style L1/L2 interrupt controller"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c index f23d7651ea84..d80e67a6aad2 100644 --- a/drivers/irqchip/irq-bcm7120-l2.c +++ b/drivers/irqchip/irq-bcm7120-l2.c @@ -220,6 +220,7 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn, { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; struct bcm7120_l2_intc_data *data; + struct platform_device *pdev; struct irq_chip_generic *gc; struct irq_chip_type *ct; int ret = 0; @@ -230,7 +231,13 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn, if (!data) return -ENOMEM; - data->num_parent_irqs = of_irq_count(dn); + pdev = of_find_device_by_node(dn); + if (!pdev) { + ret = -ENODEV; + goto out_free_data; + } + + data->num_parent_irqs = platform_irq_count(pdev); if (data->num_parent_irqs <= 0) { pr_err("invalid number of parent interrupts\n"); ret = -ENOMEM; @@ -329,6 +336,7 @@ out_unmap: if (data->map_base[idx]) iounmap(data->map_base[idx]); } +out_free_data: kfree(data); return ret; } @@ -347,8 +355,9 @@ static int __init bcm7120_l2_intc_probe_3380(struct device_node *dn, "BCM3380 L2"); } -IRQCHIP_DECLARE(bcm7120_l2_intc, "brcm,bcm7120-l2-intc", - bcm7120_l2_intc_probe_7120); - -IRQCHIP_DECLARE(bcm3380_l2_intc, "brcm,bcm3380-l2-intc", - bcm7120_l2_intc_probe_3380); +IRQCHIP_PLATFORM_DRIVER_BEGIN(bcm7120_l2) +IRQCHIP_MATCH("brcm,bcm7120-l2-intc", bcm7120_l2_intc_probe_7120) +IRQCHIP_MATCH("brcm,bcm3380-l2-intc", bcm7120_l2_intc_probe_3380) +IRQCHIP_PLATFORM_DRIVER_END(bcm7120_l2) +MODULE_DESCRIPTION("Broadcom STB 7120-style L2 interrupt controller driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 8e0911561f2d..e4efc08ac594 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -275,16 +275,18 @@ static int __init brcmstb_l2_edge_intc_of_init(struct device_node *np, { return brcmstb_l2_intc_of_init(np, parent, &l2_edge_intc_init); } -IRQCHIP_DECLARE(brcmstb_l2_intc, "brcm,l2-intc", brcmstb_l2_edge_intc_of_init); -IRQCHIP_DECLARE(brcmstb_hif_spi_l2_intc, "brcm,hif-spi-l2-intc", - brcmstb_l2_edge_intc_of_init); -IRQCHIP_DECLARE(brcmstb_upg_aux_aon_l2_intc, "brcm,upg-aux-aon-l2-intc", - brcmstb_l2_edge_intc_of_init); static int __init brcmstb_l2_lvl_intc_of_init(struct device_node *np, struct device_node *parent) { return brcmstb_l2_intc_of_init(np, parent, &l2_lvl_intc_init); } -IRQCHIP_DECLARE(bcm7271_l2_intc, "brcm,bcm7271-l2-intc", - brcmstb_l2_lvl_intc_of_init); + +IRQCHIP_PLATFORM_DRIVER_BEGIN(brcmstb_l2) +IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_of_init) +IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_of_init) +IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_of_init) +IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_of_init) +IRQCHIP_PLATFORM_DRIVER_END(brcmstb_l2) +MODULE_DESCRIPTION("Broadcom STB generic L2 interrupt 
controller"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-clps711x.c b/drivers/irqchip/irq-clps711x.c index d0da29aeedc8..77ebe7e47e0e 100644 --- a/drivers/irqchip/irq-clps711x.c +++ b/drivers/irqchip/irq-clps711x.c @@ -77,14 +77,14 @@ static asmlinkage void __exception_irq_entry clps711x_irqh(struct pt_regs *regs) irqstat = readw_relaxed(clps711x_intc->intmr[0]) & readw_relaxed(clps711x_intc->intsr[0]); if (irqstat) - handle_domain_irq(clps711x_intc->domain, - fls(irqstat) - 1, regs); + generic_handle_domain_irq(clps711x_intc->domain, + fls(irqstat) - 1); irqstat = readw_relaxed(clps711x_intc->intmr[1]) & readw_relaxed(clps711x_intc->intsr[1]); if (irqstat) - handle_domain_irq(clps711x_intc->domain, - fls(irqstat) - 1 + 16, regs); + generic_handle_domain_irq(clps711x_intc->domain, + fls(irqstat) - 1 + 16); } while (irqstat); } diff --git a/drivers/irqchip/irq-csky-apb-intc.c b/drivers/irqchip/irq-csky-apb-intc.c index ab91afa86755..d36f536506ba 100644 --- a/drivers/irqchip/irq-csky-apb-intc.c +++ b/drivers/irqchip/irq-csky-apb-intc.c @@ -138,7 +138,7 @@ static inline bool handle_irq_perbit(struct pt_regs *regs, u32 hwirq, if (hwirq == 0) return 0; - handle_domain_irq(root_domain, irq_base + __fls(hwirq), regs); + generic_handle_domain_irq(root_domain, irq_base + __fls(hwirq)); return 1; } diff --git a/drivers/irqchip/irq-csky-mpintc.c b/drivers/irqchip/irq-csky-mpintc.c index a1534edef7fa..cb403c960ac0 100644 --- a/drivers/irqchip/irq-csky-mpintc.c +++ b/drivers/irqchip/irq-csky-mpintc.c @@ -74,8 +74,8 @@ static void csky_mpintc_handler(struct pt_regs *regs) { void __iomem *reg_base = this_cpu_read(intcl_reg); - handle_domain_irq(root_domain, - readl_relaxed(reg_base + INTCL_RDYIR), regs); + generic_handle_domain_irq(root_domain, + readl_relaxed(reg_base + INTCL_RDYIR)); } static void csky_mpintc_enable(struct irq_data *d) diff --git a/drivers/irqchip/irq-davinci-aintc.c b/drivers/irqchip/irq-davinci-aintc.c index 810ccc4fe476..123eb7bfc117 100644 --- a/drivers/irqchip/irq-davinci-aintc.c +++ b/drivers/irqchip/irq-davinci-aintc.c @@ -73,7 +73,7 @@ davinci_aintc_handle_irq(struct pt_regs *regs) irqnr >>= 2; irqnr -= 1; - handle_domain_irq(davinci_aintc_irq_domain, irqnr, regs); + generic_handle_domain_irq(davinci_aintc_irq_domain, irqnr); } /* ARM Interrupt Controller Initialization */ diff --git a/drivers/irqchip/irq-davinci-cp-intc.c b/drivers/irqchip/irq-davinci-cp-intc.c index 276da2772e7f..7482c8ed34b2 100644 --- a/drivers/irqchip/irq-davinci-cp-intc.c +++ b/drivers/irqchip/irq-davinci-cp-intc.c @@ -135,7 +135,7 @@ davinci_cp_intc_handle_irq(struct pt_regs *regs) return; } - handle_domain_irq(davinci_cp_intc_irq_domain, irqnr, regs); + generic_handle_domain_irq(davinci_cp_intc_irq_domain, irqnr); } static int davinci_cp_intc_host_map(struct irq_domain *h, unsigned int virq, diff --git a/drivers/irqchip/irq-digicolor.c b/drivers/irqchip/irq-digicolor.c index fc38d2da11b9..3b0d78aac13b 100644 --- a/drivers/irqchip/irq-digicolor.c +++ b/drivers/irqchip/irq-digicolor.c @@ -50,7 +50,7 @@ static void __exception_irq_entry digicolor_handle_irq(struct pt_regs *regs) return; } - handle_domain_irq(digicolor_irq_domain, hwirq, regs); + generic_handle_domain_irq(digicolor_irq_domain, hwirq); } while (1); } diff --git a/drivers/irqchip/irq-dw-apb-ictl.c b/drivers/irqchip/irq-dw-apb-ictl.c index a67266e44491..d5c1c750c8d2 100644 --- a/drivers/irqchip/irq-dw-apb-ictl.c +++ b/drivers/irqchip/irq-dw-apb-ictl.c @@ -42,7 +42,7 @@ static void __irq_entry dw_apb_ictl_handle_irq(struct 
pt_regs *regs) while (stat) { u32 hwirq = ffs(stat) - 1; - handle_domain_irq(d, hwirq, regs); + generic_handle_domain_irq(d, hwirq); stat &= ~BIT(hwirq); } } diff --git a/drivers/irqchip/irq-ftintc010.c b/drivers/irqchip/irq-ftintc010.c index 0bf98425dca5..5cc268880f8e 100644 --- a/drivers/irqchip/irq-ftintc010.c +++ b/drivers/irqchip/irq-ftintc010.c @@ -134,7 +134,7 @@ asmlinkage void __exception_irq_entry ft010_irqchip_handle_irq(struct pt_regs *r while ((status = readl(FT010_IRQ_STATUS(f->base)))) { irq = ffs(status) - 1; - handle_domain_irq(f->domain, irq, regs); + generic_handle_domain_irq(f->domain, irq); } } diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index fd4e9a37fea6..daec3309b014 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -660,7 +660,7 @@ static inline void gic_handle_nmi(u32 irqnr, struct pt_regs *regs) * PSR.I will be restored when we ERET to the * interrupted context. */ - err = handle_domain_nmi(gic_data.domain, irqnr, regs); + err = generic_handle_domain_nmi(gic_data.domain, irqnr); if (err) gic_deactivate_unhandled(irqnr); @@ -728,7 +728,7 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs else isb(); - if (handle_domain_irq(gic_data.domain, irqnr, regs)) { + if (generic_handle_domain_irq(gic_data.domain, irqnr)) { WARN_ONCE(true, "Unexpected interrupt received!\n"); gic_deactivate_unhandled(irqnr); } diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 5f22c9d65e57..b8bb46c65a97 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -369,7 +369,7 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) this_cpu_write(sgi_intid, irqstat); } - handle_domain_irq(gic->domain, irqnr, regs); + generic_handle_domain_irq(gic->domain, irqnr); } while (1); } diff --git a/drivers/irqchip/irq-hip04.c b/drivers/irqchip/irq-hip04.c index 058ebaebe2c4..46161f6ff289 100644 --- a/drivers/irqchip/irq-hip04.c +++ b/drivers/irqchip/irq-hip04.c @@ -206,7 +206,7 @@ static void __exception_irq_entry hip04_handle_irq(struct pt_regs *regs) irqnr = irqstat & GICC_IAR_INT_ID_MASK; if (irqnr <= HIP04_MAX_IRQS) - handle_domain_irq(hip04_data.domain, irqnr, regs); + generic_handle_domain_irq(hip04_data.domain, irqnr); } while (irqnr > HIP04_MAX_IRQS); } diff --git a/drivers/irqchip/irq-ixp4xx.c b/drivers/irqchip/irq-ixp4xx.c index 37e0749215c7..fb68f8c59fbb 100644 --- a/drivers/irqchip/irq-ixp4xx.c +++ b/drivers/irqchip/irq-ixp4xx.c @@ -114,7 +114,7 @@ asmlinkage void __exception_irq_entry ixp4xx_handle_irq(struct pt_regs *regs) status = __raw_readl(ixi->irqbase + IXP4XX_ICIP); for_each_set_bit(i, &status, 32) - handle_domain_irq(ixi->domain, i, regs); + generic_handle_domain_irq(ixi->domain, i); /* * IXP465/IXP435 has an upper IRQ status register @@ -122,7 +122,7 @@ asmlinkage void __exception_irq_entry ixp4xx_handle_irq(struct pt_regs *regs) if (ixi->is_356) { status = __raw_readl(ixi->irqbase + IXP4XX_ICIP2); for_each_set_bit(i, &status, 32) - handle_domain_irq(ixi->domain, i + 32, regs); + generic_handle_domain_irq(ixi->domain, i + 32); } } diff --git a/drivers/irqchip/irq-lpc32xx.c b/drivers/irqchip/irq-lpc32xx.c index 5e6f6e25f2ae..a29357f39450 100644 --- a/drivers/irqchip/irq-lpc32xx.c +++ b/drivers/irqchip/irq-lpc32xx.c @@ -126,7 +126,7 @@ static void __exception_irq_entry lpc32xx_handle_irq(struct pt_regs *regs) while (hwirq) { irq = __ffs(hwirq); hwirq &= ~BIT(irq); - handle_domain_irq(lpc32xx_mic_irqc->domain, irq, regs); + 
generic_handle_domain_irq(lpc32xx_mic_irqc->domain, irq); } } diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c new file mode 100644 index 000000000000..c726a19837d2 --- /dev/null +++ b/drivers/irqchip/irq-mchp-eic.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Microchip External Interrupt Controller driver + * + * Copyright (C) 2021 Microchip Technology Inc. and its subsidiaries + * + * Author: Claudiu Beznea <claudiu.beznea@microchip.com> + */ +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irqchip.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/syscore_ops.h> + +#include <dt-bindings/interrupt-controller/arm-gic.h> + +#define MCHP_EIC_GFCS (0x0) +#define MCHP_EIC_SCFG(x) (0x4 + (x) * 0x4) +#define MCHP_EIC_SCFG_EN BIT(16) +#define MCHP_EIC_SCFG_LVL BIT(9) +#define MCHP_EIC_SCFG_POL BIT(8) + +#define MCHP_EIC_NIRQ (2) + +/* + * struct mchp_eic - EIC private data structure + * @base: base address + * @clk: peripheral clock + * @domain: irq domain + * @irqs: irqs b/w eic and gic + * @scfg: backup for scfg registers (necessary for backup and self-refresh mode) + * @wakeup_source: wakeup source mask + */ +struct mchp_eic { + void __iomem *base; + struct clk *clk; + struct irq_domain *domain; + u32 irqs[MCHP_EIC_NIRQ]; + u32 scfg[MCHP_EIC_NIRQ]; + u32 wakeup_source; +}; + +static struct mchp_eic *eic; + +static void mchp_eic_irq_mask(struct irq_data *d) +{ + unsigned int tmp; + + tmp = readl_relaxed(eic->base + MCHP_EIC_SCFG(d->hwirq)); + tmp &= ~MCHP_EIC_SCFG_EN; + writel_relaxed(tmp, eic->base + MCHP_EIC_SCFG(d->hwirq)); + + irq_chip_mask_parent(d); +} + +static void mchp_eic_irq_unmask(struct irq_data *d) +{ + unsigned int tmp; + + tmp = readl_relaxed(eic->base + MCHP_EIC_SCFG(d->hwirq)); + tmp |= MCHP_EIC_SCFG_EN; + writel_relaxed(tmp, eic->base + MCHP_EIC_SCFG(d->hwirq)); + + irq_chip_unmask_parent(d); +} + +static int mchp_eic_irq_set_type(struct irq_data *d, unsigned int type) +{ + unsigned int parent_irq_type; + unsigned int tmp; + + tmp = readl_relaxed(eic->base + MCHP_EIC_SCFG(d->hwirq)); + tmp &= ~(MCHP_EIC_SCFG_POL | MCHP_EIC_SCFG_LVL); + switch (type) { + case IRQ_TYPE_LEVEL_HIGH: + tmp |= MCHP_EIC_SCFG_POL | MCHP_EIC_SCFG_LVL; + parent_irq_type = IRQ_TYPE_LEVEL_HIGH; + break; + case IRQ_TYPE_LEVEL_LOW: + tmp |= MCHP_EIC_SCFG_LVL; + parent_irq_type = IRQ_TYPE_LEVEL_HIGH; + break; + case IRQ_TYPE_EDGE_RISING: + parent_irq_type = IRQ_TYPE_EDGE_RISING; + break; + case IRQ_TYPE_EDGE_FALLING: + tmp |= MCHP_EIC_SCFG_POL; + parent_irq_type = IRQ_TYPE_EDGE_RISING; + break; + default: + return -EINVAL; + } + + writel_relaxed(tmp, eic->base + MCHP_EIC_SCFG(d->hwirq)); + + return irq_chip_set_type_parent(d, parent_irq_type); +} + +static int mchp_eic_irq_set_wake(struct irq_data *d, unsigned int on) +{ + irq_set_irq_wake(eic->irqs[d->hwirq], on); + if (on) + eic->wakeup_source |= BIT(d->hwirq); + else + eic->wakeup_source &= ~BIT(d->hwirq); + + return 0; +} + +static int mchp_eic_irq_suspend(void) +{ + unsigned int hwirq; + + for (hwirq = 0; hwirq < MCHP_EIC_NIRQ; hwirq++) + eic->scfg[hwirq] = readl_relaxed(eic->base + + MCHP_EIC_SCFG(hwirq)); + + if (!eic->wakeup_source) + clk_disable_unprepare(eic->clk); + + return 0; +} + +static void mchp_eic_irq_resume(void) +{ + unsigned int hwirq; + + if (!eic->wakeup_source) + clk_prepare_enable(eic->clk); + + for (hwirq = 0; hwirq < MCHP_EIC_NIRQ; hwirq++) + writel_relaxed(eic->scfg[hwirq], eic->base + + 
MCHP_EIC_SCFG(hwirq)); +} + +static struct syscore_ops mchp_eic_syscore_ops = { + .suspend = mchp_eic_irq_suspend, + .resume = mchp_eic_irq_resume, +}; + +static struct irq_chip mchp_eic_chip = { + .name = "eic", + .flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_SET_TYPE_MASKED, + .irq_mask = mchp_eic_irq_mask, + .irq_unmask = mchp_eic_irq_unmask, + .irq_set_type = mchp_eic_irq_set_type, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_wake = mchp_eic_irq_set_wake, +}; + +static int mchp_eic_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *data) +{ + struct irq_fwspec *fwspec = data; + struct irq_fwspec parent_fwspec; + irq_hw_number_t hwirq; + unsigned int type; + int ret; + + if (WARN_ON(nr_irqs != 1)) + return -EINVAL; + + ret = irq_domain_translate_twocell(domain, fwspec, &hwirq, &type); + if (ret || hwirq >= MCHP_EIC_NIRQ) + return ret; + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + case IRQ_TYPE_LEVEL_HIGH: + break; + case IRQ_TYPE_EDGE_FALLING: + type = IRQ_TYPE_EDGE_RISING; + break; + case IRQ_TYPE_LEVEL_LOW: + type = IRQ_TYPE_LEVEL_HIGH; + break; + default: + return -EINVAL; + } + + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, &mchp_eic_chip, eic); + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 3; + parent_fwspec.param[0] = GIC_SPI; + parent_fwspec.param[1] = eic->irqs[hwirq]; + parent_fwspec.param[2] = type; + + return irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); +} + +static const struct irq_domain_ops mchp_eic_domain_ops = { + .translate = irq_domain_translate_twocell, + .alloc = mchp_eic_domain_alloc, + .free = irq_domain_free_irqs_common, +}; + +static int mchp_eic_init(struct device_node *node, struct device_node *parent) +{ + struct irq_domain *parent_domain = NULL; + int ret, i; + + eic = kzalloc(sizeof(*eic), GFP_KERNEL); + if (!eic) + return -ENOMEM; + + eic->base = of_iomap(node, 0); + if (!eic->base) { + ret = -ENOMEM; + goto free; + } + + parent_domain = irq_find_host(parent); + if (!parent_domain) { + ret = -ENODEV; + goto unmap; + } + + eic->clk = of_clk_get_by_name(node, "pclk"); + if (IS_ERR(eic->clk)) { + ret = PTR_ERR(eic->clk); + goto unmap; + } + + ret = clk_prepare_enable(eic->clk); + if (ret) + goto unmap; + + for (i = 0; i < MCHP_EIC_NIRQ; i++) { + struct of_phandle_args irq; + + /* Disable it, if any. 
*/ + writel_relaxed(0UL, eic->base + MCHP_EIC_SCFG(i)); + + ret = of_irq_parse_one(node, i, &irq); + if (ret) + goto clk_unprepare; + + if (WARN_ON(irq.args_count != 3)) { + ret = -EINVAL; + goto clk_unprepare; + } + + eic->irqs[i] = irq.args[1]; + } + + eic->domain = irq_domain_add_hierarchy(parent_domain, 0, MCHP_EIC_NIRQ, + node, &mchp_eic_domain_ops, eic); + if (!eic->domain) { + pr_err("%pOF: Failed to add domain\n", node); + ret = -ENODEV; + goto clk_unprepare; + } + + register_syscore_ops(&mchp_eic_syscore_ops); + + pr_info("%pOF: EIC registered, nr_irqs %u\n", node, MCHP_EIC_NIRQ); + + return 0; + +clk_unprepare: + clk_disable_unprepare(eic->clk); +unmap: + iounmap(eic->base); +free: + kfree(eic); + return ret; +} + +IRQCHIP_PLATFORM_DRIVER_BEGIN(mchp_eic) +IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_init) +IRQCHIP_PLATFORM_DRIVER_END(mchp_eic) + +MODULE_DESCRIPTION("Microchip External Interrupt Controller"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Claudiu Beznea <claudiu.beznea@microchip.com>"); diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index e50676ce2ec8..d90ff0b92480 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -436,8 +436,7 @@ static const struct irq_domain_ops meson_gpio_irq_domain_ops = { .translate = meson_gpio_irq_domain_translate, }; -static int __init meson_gpio_irq_parse_dt(struct device_node *node, - struct meson_gpio_irq_controller *ctl) +static int meson_gpio_irq_parse_dt(struct device_node *node, struct meson_gpio_irq_controller *ctl) { const struct of_device_id *match; int ret; @@ -463,8 +462,7 @@ static int __init meson_gpio_irq_parse_dt(struct device_node *node, return 0; } -static int __init meson_gpio_irq_of_init(struct device_node *node, - struct device_node *parent) +static int meson_gpio_irq_of_init(struct device_node *node, struct device_node *parent) { struct irq_domain *domain, *parent_domain; struct meson_gpio_irq_controller *ctl; @@ -521,5 +519,10 @@ free_ctl: return ret; } -IRQCHIP_DECLARE(meson_gpio_intc, "amlogic,meson-gpio-intc", - meson_gpio_irq_of_init); +IRQCHIP_PLATFORM_DRIVER_BEGIN(meson_gpio_intc) +IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_of_init) +IRQCHIP_PLATFORM_DRIVER_END(meson_gpio_intc) + +MODULE_AUTHOR("Jerome Brunet <jbrunet@baylibre.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:meson-gpio-intc"); diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 54c7092cc61d..d02b05a067d9 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -381,24 +381,35 @@ static void gic_unmask_local_irq_all_vpes(struct irq_data *d) spin_unlock_irqrestore(&gic_lock, flags); } -static void gic_all_vpes_irq_cpu_online(struct irq_data *d) +static void gic_all_vpes_irq_cpu_online(void) { - struct gic_all_vpes_chip_data *cd; - unsigned int intr; + static const unsigned int local_intrs[] = { + GIC_LOCAL_INT_TIMER, + GIC_LOCAL_INT_PERFCTR, + GIC_LOCAL_INT_FDC, + }; + unsigned long flags; + int i; - intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - cd = irq_data_get_irq_chip_data(d); + spin_lock_irqsave(&gic_lock, flags); - write_gic_vl_map(mips_gic_vx_map_reg(intr), cd->map); - if (cd->mask) - write_gic_vl_smask(BIT(intr)); + for (i = 0; i < ARRAY_SIZE(local_intrs); i++) { + unsigned int intr = local_intrs[i]; + struct gic_all_vpes_chip_data *cd; + + cd = &gic_all_vpes_chip_data[intr]; + write_gic_vl_map(mips_gic_vx_map_reg(intr), cd->map); + if (cd->mask) + write_gic_vl_smask(BIT(intr)); + } + + 
spin_unlock_irqrestore(&gic_lock, flags); } static struct irq_chip gic_all_vpes_local_irq_controller = { .name = "MIPS GIC Local", .irq_mask = gic_mask_local_irq_all_vpes, .irq_unmask = gic_unmask_local_irq_all_vpes, - .irq_cpu_online = gic_all_vpes_irq_cpu_online, }; static void __gic_irq_dispatch(void) @@ -477,6 +488,10 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, intr = GIC_HWIRQ_TO_LOCAL(hwirq); map = GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin; + /* + * If adding support for more per-cpu interrupts, keep the the + * array in gic_all_vpes_irq_cpu_online() in sync. + */ switch (intr) { case GIC_LOCAL_INT_TIMER: /* CONFIG_MIPS_CMP workaround (see __gic_init) */ @@ -663,8 +678,8 @@ static int gic_cpu_startup(unsigned int cpu) /* Clear all local IRQ masks (ie. disable all local interrupts) */ write_gic_vl_rmask(~0); - /* Invoke irq_cpu_online callbacks to enable desired interrupts */ - irq_cpu_online(); + /* Enable desired interrupts */ + gic_all_vpes_irq_cpu_online(); return 0; } diff --git a/drivers/irqchip/irq-mmp.c b/drivers/irqchip/irq-mmp.c index 4a74ac7b7c42..83455ca72439 100644 --- a/drivers/irqchip/irq-mmp.c +++ b/drivers/irqchip/irq-mmp.c @@ -230,7 +230,7 @@ static void __exception_irq_entry mmp_handle_irq(struct pt_regs *regs) if (!(hwirq & SEL_INT_PENDING)) return; hwirq &= SEL_INT_NUM_MASK; - handle_domain_irq(icu_data[0].domain, hwirq, regs); + generic_handle_domain_irq(icu_data[0].domain, hwirq); } static void __exception_irq_entry mmp2_handle_irq(struct pt_regs *regs) @@ -241,7 +241,7 @@ static void __exception_irq_entry mmp2_handle_irq(struct pt_regs *regs) if (!(hwirq & SEL_INT_PENDING)) return; hwirq &= SEL_INT_NUM_MASK; - handle_domain_irq(icu_data[0].domain, hwirq, regs); + generic_handle_domain_irq(icu_data[0].domain, hwirq); } /* MMP (ARMv5) */ diff --git a/drivers/irqchip/irq-mvebu-icu.c b/drivers/irqchip/irq-mvebu-icu.c index 090bc3f4f7d8..3e7297fc5948 100644 --- a/drivers/irqchip/irq-mvebu-icu.c +++ b/drivers/irqchip/irq-mvebu-icu.c @@ -347,7 +347,6 @@ builtin_platform_driver(mvebu_icu_subset_driver); static int mvebu_icu_probe(struct platform_device *pdev) { struct mvebu_icu *icu; - struct resource *res; int i; icu = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_icu), @@ -357,8 +356,7 @@ static int mvebu_icu_probe(struct platform_device *pdev) icu->dev = &pdev->dev; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - icu->base = devm_ioremap_resource(&pdev->dev, res); + icu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(icu->base)) return PTR_ERR(icu->base); diff --git a/drivers/irqchip/irq-mvebu-pic.c b/drivers/irqchip/irq-mvebu-pic.c index dc1cee4b0fe1..870f9866b8da 100644 --- a/drivers/irqchip/irq-mvebu-pic.c +++ b/drivers/irqchip/irq-mvebu-pic.c @@ -121,14 +121,12 @@ static int mvebu_pic_probe(struct platform_device *pdev) struct device_node *node = pdev->dev.of_node; struct mvebu_pic *pic; struct irq_chip *irq_chip; - struct resource *res; pic = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_pic), GFP_KERNEL); if (!pic) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - pic->base = devm_ioremap_resource(&pdev->dev, res); + pic->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pic->base)) return PTR_ERR(pic->base); diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c index d1f5740cd575..55cb6b5a686e 100644 --- a/drivers/irqchip/irq-mxs.c +++ b/drivers/irqchip/irq-mxs.c @@ -136,7 +136,7 @@ asmlinkage void __exception_irq_entry icoll_handle_irq(struct pt_regs *regs) irqnr = 
__raw_readl(icoll_priv.stat); __raw_writel(irqnr, icoll_priv.vector); - handle_domain_irq(icoll_domain, irqnr, regs); + generic_handle_domain_irq(icoll_domain, irqnr); } static int icoll_irq_domain_map(struct irq_domain *d, unsigned int virq, diff --git a/drivers/irqchip/irq-nvic.c b/drivers/irqchip/irq-nvic.c index b31c4cff4d3a..63bac3f78863 100644 --- a/drivers/irqchip/irq-nvic.c +++ b/drivers/irqchip/irq-nvic.c @@ -37,10 +37,25 @@ static struct irq_domain *nvic_irq_domain; +static void __nvic_handle_irq(irq_hw_number_t hwirq) +{ + generic_handle_domain_irq(nvic_irq_domain, hwirq); +} + +/* + * TODO: restructure the ARMv7M entry logic so that this entry logic can live + * in arch code. + */ asmlinkage void __exception_irq_entry nvic_handle_irq(irq_hw_number_t hwirq, struct pt_regs *regs) { - handle_domain_irq(nvic_irq_domain, hwirq, regs); + struct pt_regs *old_regs; + + irq_enter(); + old_regs = set_irq_regs(regs); + __nvic_handle_irq(hwirq); + set_irq_regs(old_regs); + irq_exit(); } static int nvic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, diff --git a/drivers/irqchip/irq-omap-intc.c b/drivers/irqchip/irq-omap-intc.c index d360a6eddd6d..dc82162ba763 100644 --- a/drivers/irqchip/irq-omap-intc.c +++ b/drivers/irqchip/irq-omap-intc.c @@ -357,7 +357,7 @@ omap_intc_handle_irq(struct pt_regs *regs) } irqnr &= ACTIVEIRQ_MASK; - handle_domain_irq(domain, irqnr, regs); + generic_handle_domain_irq(domain, irqnr); } static int __init intc_of_init(struct device_node *node, diff --git a/drivers/irqchip/irq-or1k-pic.c b/drivers/irqchip/irq-or1k-pic.c index 03d2366118dd..49b47e787644 100644 --- a/drivers/irqchip/irq-or1k-pic.c +++ b/drivers/irqchip/irq-or1k-pic.c @@ -116,7 +116,7 @@ static void or1k_pic_handle_irq(struct pt_regs *regs) int irq = -1; while ((irq = pic_get_irq(irq + 1)) != NO_IRQ) - handle_domain_irq(root_domain, irq, regs); + generic_handle_domain_irq(root_domain, irq); } static int or1k_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) diff --git a/drivers/irqchip/irq-orion.c b/drivers/irqchip/irq-orion.c index b6868f7b805a..17c2c7a07f10 100644 --- a/drivers/irqchip/irq-orion.c +++ b/drivers/irqchip/irq-orion.c @@ -42,8 +42,8 @@ __exception_irq_entry orion_handle_irq(struct pt_regs *regs) gc->mask_cache; while (stat) { u32 hwirq = __fls(stat); - handle_domain_irq(orion_irq_domain, - gc->irq_base + hwirq, regs); + generic_handle_domain_irq(orion_irq_domain, + gc->irq_base + hwirq); stat &= ~(1 << hwirq); } } diff --git a/drivers/irqchip/irq-rda-intc.c b/drivers/irqchip/irq-rda-intc.c index 960168303b73..9f0144a73777 100644 --- a/drivers/irqchip/irq-rda-intc.c +++ b/drivers/irqchip/irq-rda-intc.c @@ -53,7 +53,7 @@ static void __exception_irq_entry rda_handle_irq(struct pt_regs *regs) while (stat) { hwirq = __fls(stat); - handle_domain_irq(rda_irq_domain, hwirq, regs); + generic_handle_domain_irq(rda_irq_domain, hwirq); stat &= ~BIT(hwirq); } } diff --git a/drivers/irqchip/irq-riscv-intc.c b/drivers/irqchip/irq-riscv-intc.c index 8017f6d32d52..b65bd8878d4f 100644 --- a/drivers/irqchip/irq-riscv-intc.c +++ b/drivers/irqchip/irq-riscv-intc.c @@ -37,7 +37,7 @@ static asmlinkage void riscv_intc_irq(struct pt_regs *regs) break; #endif default: - handle_domain_irq(intc_domain, cause, regs); + generic_handle_domain_irq(intc_domain, cause); break; } } diff --git a/drivers/irqchip/irq-sa11x0.c b/drivers/irqchip/irq-sa11x0.c index dbccc7dafbf8..31c202a1ae62 100644 --- a/drivers/irqchip/irq-sa11x0.c +++ b/drivers/irqchip/irq-sa11x0.c @@ -140,8 +140,8 @@ 
sa1100_handle_irq(struct pt_regs *regs) if (mask == 0) break; - handle_domain_irq(sa1100_normal_irqdomain, - ffs(mask) - 1, regs); + generic_handle_domain_irq(sa1100_normal_irqdomain, + ffs(mask) - 1); } while (1); } diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index 33c76710f845..b7cb2da71888 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -850,7 +850,6 @@ static int stm32_exti_probe(struct platform_device *pdev) struct irq_domain *parent_domain, *domain; struct stm32_exti_host_data *host_data; const struct stm32_exti_drv_data *drv_data; - struct resource *res; host_data = devm_kzalloc(dev, sizeof(*host_data), GFP_KERNEL); if (!host_data) @@ -888,8 +887,7 @@ static int stm32_exti_probe(struct platform_device *pdev) if (!host_data->chips_data) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - host_data->base = devm_ioremap_resource(dev, res); + host_data->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(host_data->base)) return PTR_ERR(host_data->base); diff --git a/drivers/irqchip/irq-sun4i.c b/drivers/irqchip/irq-sun4i.c index 8a315d6a3399..dd506ebfdacb 100644 --- a/drivers/irqchip/irq-sun4i.c +++ b/drivers/irqchip/irq-sun4i.c @@ -195,7 +195,7 @@ static void __exception_irq_entry sun4i_handle_irq(struct pt_regs *regs) return; do { - handle_domain_irq(irq_ic_data->irq_domain, hwirq, regs); + generic_handle_domain_irq(irq_ic_data->irq_domain, hwirq); hwirq = readl(irq_ic_data->irq_base + SUN4I_IRQ_VECTOR_REG) >> 2; } while (hwirq != 0); diff --git a/drivers/irqchip/irq-ti-sci-inta.c b/drivers/irqchip/irq-ti-sci-inta.c index 97f454ec376b..8eba08db33b2 100644 --- a/drivers/irqchip/irq-ti-sci-inta.c +++ b/drivers/irqchip/irq-ti-sci-inta.c @@ -650,7 +650,6 @@ static int ti_sci_inta_irq_domain_probe(struct platform_device *pdev) struct device_node *parent_node, *node; struct ti_sci_inta_irq_domain *inta; struct device *dev = &pdev->dev; - struct resource *res; int ret; node = dev_of_node(dev); @@ -694,8 +693,7 @@ static int ti_sci_inta_irq_domain_probe(struct platform_device *pdev) return PTR_ERR(inta->global_event); } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - inta->base = devm_ioremap_resource(dev, res); + inta->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(inta->base)) return PTR_ERR(inta->base); diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c index 34337a61b1ef..f032db23b30f 100644 --- a/drivers/irqchip/irq-ts4800.c +++ b/drivers/irqchip/irq-ts4800.c @@ -93,15 +93,13 @@ static int ts4800_ic_probe(struct platform_device *pdev) struct device_node *node = pdev->dev.of_node; struct ts4800_irq_data *data; struct irq_chip *irq_chip; - struct resource *res; int parent_irq; data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - data->base = devm_ioremap_resource(&pdev->dev, res); + data->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(data->base)) return PTR_ERR(data->base); diff --git a/drivers/irqchip/irq-versatile-fpga.c b/drivers/irqchip/irq-versatile-fpga.c index 75be350cf82f..f2757b6aecc8 100644 --- a/drivers/irqchip/irq-versatile-fpga.c +++ b/drivers/irqchip/irq-versatile-fpga.c @@ -105,7 +105,7 @@ static int handle_one_fpga(struct fpga_irq_data *f, struct pt_regs *regs) while ((status = readl(f->base + IRQ_STATUS))) { irq = ffs(status) - 1; - handle_domain_irq(f->domain, irq, regs); + generic_handle_domain_irq(f->domain, irq); 
handled = 1; } diff --git a/drivers/irqchip/irq-vic.c b/drivers/irqchip/irq-vic.c index 1e1f2d115257..9e3d5561e04e 100644 --- a/drivers/irqchip/irq-vic.c +++ b/drivers/irqchip/irq-vic.c @@ -208,7 +208,7 @@ static int handle_one_vic(struct vic_device *vic, struct pt_regs *regs) while ((stat = readl_relaxed(vic->base + VIC_IRQ_STATUS))) { irq = ffs(stat) - 1; - handle_domain_irq(vic->domain, irq, regs); + generic_handle_domain_irq(vic->domain, irq); handled = 1; } diff --git a/drivers/irqchip/irq-vt8500.c b/drivers/irqchip/irq-vt8500.c index 5bce936af5d9..e17dd3a8c2d5 100644 --- a/drivers/irqchip/irq-vt8500.c +++ b/drivers/irqchip/irq-vt8500.c @@ -183,7 +183,7 @@ static void __exception_irq_entry vt8500_handle_irq(struct pt_regs *regs) continue; } - handle_domain_irq(intc[i].domain, irqnr, regs); + generic_handle_domain_irq(intc[i].domain, irqnr); } } diff --git a/drivers/irqchip/irq-wpcm450-aic.c b/drivers/irqchip/irq-wpcm450-aic.c index f3ac392d5bc8..0dcbeb1a05a1 100644 --- a/drivers/irqchip/irq-wpcm450-aic.c +++ b/drivers/irqchip/irq-wpcm450-aic.c @@ -69,7 +69,7 @@ static void __exception_irq_entry wpcm450_aic_handle_irq(struct pt_regs *regs) /* Read IPER to signal that nIRQ can be de-asserted */ hwirq = readl(aic->regs + AIC_IPER) / 4; - handle_domain_irq(aic->domain, hwirq, regs); + generic_handle_domain_irq(aic->domain, hwirq); } static void wpcm450_aic_eoi(struct irq_data *d) diff --git a/drivers/irqchip/irq-zevio.c b/drivers/irqchip/irq-zevio.c index 84163f1ebfcf..7a72620fc478 100644 --- a/drivers/irqchip/irq-zevio.c +++ b/drivers/irqchip/irq-zevio.c @@ -50,7 +50,7 @@ static void __exception_irq_entry zevio_handle_irq(struct pt_regs *regs) while (readl(zevio_irq_io + IO_STATUS)) { irqnr = readl(zevio_irq_io + IO_CURRENT); - handle_domain_irq(zevio_irq_domain, irqnr, regs); + generic_handle_domain_irq(zevio_irq_domain, irqnr); } } diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index cb0afe897162..7313454e403a 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -480,6 +480,11 @@ int detach_capi_ctr(struct capi_ctr *ctr) ctr_down(ctr, CAPI_CTR_DETACHED); + if (ctr->cnr < 1 || ctr->cnr - 1 >= CAPI_MAXCONTR) { + err = -EINVAL; + goto unlock_out; + } + if (capi_controller[ctr->cnr - 1] != ctr) { err = -EINVAL; goto unlock_out; diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index e501cb03f211..bd087cca1c1d 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -1994,14 +1994,14 @@ setup_hw(struct hfc_pci *hc) pci_set_master(hc->pdev); if (!hc->irq) { printk(KERN_WARNING "HFC-PCI: No IRQ for PCI card found\n"); - return 1; + return -EINVAL; } hc->hw.pci_io = (char __iomem *)(unsigned long)hc->pdev->resource[1].start; if (!hc->hw.pci_io) { printk(KERN_WARNING "HFC-PCI: No IO-Mem for PCI card found\n"); - return 1; + return -ENOMEM; } /* Allocate memory for FIFOS */ /* the memory needs to be on a 32k boundary within the first 4G */ @@ -2012,7 +2012,7 @@ setup_hw(struct hfc_pci *hc) if (!buffer) { printk(KERN_WARNING "HFC-PCI: Error allocating memory for FIFO!\n"); - return 1; + return -ENOMEM; } hc->hw.fifos = buffer; pci_write_config_dword(hc->pdev, 0x80, hc->hw.dmahandle); @@ -2022,7 +2022,7 @@ setup_hw(struct hfc_pci *hc) "HFC-PCI: Error in ioremap for PCI!\n"); dma_free_coherent(&hc->pdev->dev, 0x8000, hc->hw.fifos, hc->hw.dmahandle); - return 1; + return -ENOMEM; } printk(KERN_INFO diff --git a/drivers/isdn/hardware/mISDN/netjet.c 
b/drivers/isdn/hardware/mISDN/netjet.c index 2a1ddd47a096..a52f275f8263 100644 --- a/drivers/isdn/hardware/mISDN/netjet.c +++ b/drivers/isdn/hardware/mISDN/netjet.c @@ -949,8 +949,8 @@ nj_release(struct tiger_hw *card) nj_disable_hwirq(card); mode_tiger(&card->bc[0], ISDN_P_NONE); mode_tiger(&card->bc[1], ISDN_P_NONE); - card->isac.release(&card->isac); spin_unlock_irqrestore(&card->lock, flags); + card->isac.release(&card->isac); release_region(card->base, card->base_s); card->base_s = 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5fc989a6d452..9ed9c955add7 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -178,7 +178,6 @@ #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ -#include <linux/bcache.h> #include <linux/bio.h> #include <linux/kobject.h> #include <linux/list.h> @@ -190,6 +189,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> +#include "bcache_ondisk.h" #include "bset.h" #include "util.h" #include "closure.h" @@ -395,8 +395,6 @@ struct cached_dev { atomic_t io_errors; unsigned int error_limit; unsigned int offline_seconds; - - char backing_dev_name[BDEVNAME_SIZE]; }; enum alloc_reserve { @@ -470,8 +468,6 @@ struct cache { atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; - - char cache_dev_name[BDEVNAME_SIZE]; }; struct gc_stat { diff --git a/include/uapi/linux/bcache.h b/drivers/md/bcache/bcache_ondisk.h index cf7399f03b71..97413586195b 100644 --- a/include/uapi/linux/bcache.h +++ b/drivers/md/bcache/bcache_ondisk.h @@ -43,9 +43,9 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ #define KEY_MAX_U64S 8 KEY_FIELD(KEY_PTRS, high, 60, 3) -KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(__PAD0, high, 58, 2) KEY_FIELD(KEY_CSUM, high, 56, 2) -KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(__PAD1, high, 55, 1) KEY_FIELD(KEY_DIRTY, high, 36, 1) KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index a50dcfda656f..d795c84246b0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -2,10 +2,10 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H -#include <linux/bcache.h> #include <linux/kernel.h> #include <linux/types.h> +#include "bcache_ondisk.h" #include "util.h" /* for time_stats */ /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 0595559de174..93b67b8d31c3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -141,7 +141,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) uint64_t crc = b->key.ptr[0]; void *data = (void *) i + 8, *end = bset_bkey_last(i); - crc = bch_crc64_update(crc, data, end - data); + crc = crc64_be(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 116edda845c3..6230dfdd9286 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -127,21 +127,20 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) citer.bi_size = UINT_MAX; bio_for_each_segment(bv, bio, iter) { - void *p1 = kmap_atomic(bv.bv_page); + void *p1 = bvec_kmap_local(&bv); void *p2; cbv = bio_iter_iovec(check, citer); - p2 = page_address(cbv.bv_page); + p2 = bvec_kmap_local(&cbv); - cache_set_err_on(memcmp(p1 + bv.bv_offset, - p2 + bv.bv_offset, - bv.bv_len), + cache_set_err_on(memcmp(p1, p2, bv.bv_len), dc->disk.c, - "verify failed at dev %s sector %llu", - dc->backing_dev_name, + "verify failed at dev %pg 
sector %llu", + dc->bdev, (uint64_t) bio->bi_iter.bi_sector); - kunmap_atomic(p1); + kunmap_local(p2); + kunmap_local(p1); bio_advance_iter(check, &citer, bv.bv_len); } diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c index 6d2b7b84a7b7..634922c5601d 100644 --- a/drivers/md/bcache/features.c +++ b/drivers/md/bcache/features.c @@ -6,7 +6,7 @@ * Copyright 2020 Coly Li <colyli@suse.de> * */ -#include <linux/bcache.h> +#include "bcache_ondisk.h" #include "bcache.h" #include "features.h" diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index d1c8fd3977fc..09161b89c63e 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -2,10 +2,11 @@ #ifndef _BCACHE_FEATURES_H #define _BCACHE_FEATURES_H -#include <linux/bcache.h> #include <linux/kernel.h> #include <linux/types.h> +#include "bcache_ondisk.h" + #define BCH_FEATURE_COMPAT 0 #define BCH_FEATURE_RO_COMPAT 1 #define BCH_FEATURE_INCOMPAT 2 diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index e4388fe3ab7e..9c6f9ec55b72 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -65,15 +65,15 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. */ if (bio->bi_opf & REQ_RAHEAD) { - pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore\n", - dc->backing_dev_name); + pr_warn_ratelimited("%pg: Read-ahead I/O failed on backing device, ignore\n", + dc->bdev); return; } errors = atomic_add_return(1, &dc->io_errors); if (errors < dc->error_limit) - pr_err("%s: IO error on backing device, unrecoverable\n", - dc->backing_dev_name); + pr_err("%pg: IO error on backing device, unrecoverable\n", + dc->bdev); else bch_cached_dev_error(dc); } @@ -123,13 +123,13 @@ void bch_count_io_errors(struct cache *ca, errors >>= IO_ERROR_SHIFT; if (errors < ca->set->error_limit) - pr_err("%s: IO error on %s%s\n", - ca->cache_dev_name, m, + pr_err("%pg: IO error on %s%s\n", + ca->bdev, m, is_read ? ", recovering." 
: "."); else bch_cache_set_error(ca->set, - "%s: too many IO errors %s\n", - ca->cache_dev_name, m); + "%pg: too many IO errors %s\n", + ca->bdev, m); } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6d1de889baeb..d15aae6c51c1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -46,7 +46,7 @@ static void bio_csum(struct bio *bio, struct bkey *k) bio_for_each_segment(bv, bio, iter) { void *d = kmap(bv.bv_page) + bv.bv_offset; - csum = bch_crc64_update(csum, d, bv.bv_len); + csum = crc64_be(csum, d, bv.bv_len); kunmap(bv.bv_page); } @@ -651,8 +651,8 @@ static void backing_request_endio(struct bio *bio) */ if (unlikely(s->iop.writeback && bio->bi_opf & REQ_PREFLUSH)) { - pr_err("Can't flush %s: returned bi_status %i\n", - dc->backing_dev_name, bio->bi_status); + pr_err("Can't flush %pg: returned bi_status %i\n", + dc->bdev, bio->bi_status); } else { /* set to orig_bio->bi_status in bio_complete() */ s->iop.status = bio->bi_status; @@ -1163,7 +1163,7 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -blk_qc_t cached_dev_submit_bio(struct bio *bio) +void cached_dev_submit_bio(struct bio *bio) { struct search *s; struct block_device *orig_bdev = bio->bi_bdev; @@ -1176,7 +1176,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) dc->io_disable)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - return BLK_QC_T_NONE; + return; } if (likely(d->c)) { @@ -1222,8 +1222,6 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } else /* I/O request sent to backing device */ detached_dev_do_request(d, bio, orig_bdev, start_time); - - return BLK_QC_T_NONE; } static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, @@ -1273,7 +1271,7 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -blk_qc_t flash_dev_submit_bio(struct bio *bio) +void flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; @@ -1282,7 +1280,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - return BLK_QC_T_NONE; + return; } s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio)); @@ -1298,7 +1296,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) continue_at_nobarrier(&s->cl, flash_dev_nodata, bcache_wq); - return BLK_QC_T_NONE; + return; } else if (bio_data_dir(bio)) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), @@ -1314,7 +1312,6 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) } continue_at(cl, search_free, NULL); - return BLK_QC_T_NONE; } static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 82b38366a95d..38ab4856eaab 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,10 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); -blk_qc_t cached_dev_submit_bio(struct bio *bio); +void cached_dev_submit_bio(struct bio *bio); void bch_flash_dev_request_init(struct bcache_device *d); -blk_qc_t flash_dev_submit_bio(struct bio *bio); +void flash_dev_submit_bio(struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f2874c77ff79..4a9a65dff95e 100644 --- a/drivers/md/bcache/super.c +++ 
b/drivers/md/bcache/super.c @@ -1002,7 +1002,7 @@ static void calc_cached_dev_sectors(struct cache_set *c) struct cached_dev *dc; list_for_each_entry(dc, &c->cached_devs, list) - sectors += bdev_sectors(dc->bdev); + sectors += bdev_nr_sectors(dc->bdev); c->cached_dev_sectors = sectors; } @@ -1026,8 +1026,8 @@ static int cached_dev_status_update(void *arg) dc->offline_seconds = 0; if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { - pr_err("%s: device offline for %d seconds\n", - dc->backing_dev_name, + pr_err("%pg: device offline for %d seconds\n", + dc->bdev, BACKING_DEV_OFFLINE_TIMEOUT); pr_err("%s: disable I/O request due to backing device offline\n", dc->disk.name); @@ -1058,15 +1058,13 @@ int bch_cached_dev_run(struct cached_dev *dc) }; if (dc->io_disable) { - pr_err("I/O disabled on cached dev %s\n", - dc->backing_dev_name); + pr_err("I/O disabled on cached dev %pg\n", dc->bdev); ret = -EIO; goto out; } if (atomic_xchg(&dc->running, 1)) { - pr_info("cached dev %s is running already\n", - dc->backing_dev_name); + pr_info("cached dev %pg is running already\n", dc->bdev); ret = -EBUSY; goto out; } @@ -1082,7 +1080,9 @@ int bch_cached_dev_run(struct cached_dev *dc) closure_sync(&cl); } - add_disk(d->disk); + ret = add_disk(d->disk); + if (ret) + goto out; bd_link_disk_holder(dc->bdev, dc->disk.disk); /* * won't show up in the uevent file, use udevadm monitor -e instead @@ -1154,16 +1154,16 @@ static void cached_dev_detach_finish(struct work_struct *w) mutex_lock(&bch_register_lock); - calc_cached_dev_sectors(dc->disk.c); bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + calc_cached_dev_sectors(dc->disk.c); clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); mutex_unlock(&bch_register_lock); - pr_info("Caching disabled for %s\n", dc->backing_dev_name); + pr_info("Caching disabled for %pg\n", dc->bdev); /* Drop ref we took in cached_dev_detach() */ closure_put(&dc->disk.cl); @@ -1203,29 +1203,27 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return -ENOENT; if (dc->disk.c) { - pr_err("Can't attach %s: already attached\n", - dc->backing_dev_name); + pr_err("Can't attach %pg: already attached\n", dc->bdev); return -EINVAL; } if (test_bit(CACHE_SET_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down\n", - dc->backing_dev_name); + pr_err("Can't attach %pg: shutting down\n", dc->bdev); return -EINVAL; } if (dc->sb.block_size < c->cache->sb.block_size) { /* Will die */ - pr_err("Couldn't attach %s: block size less than set's block size\n", - dc->backing_dev_name); + pr_err("Couldn't attach %pg: block size less than set's block size\n", + dc->bdev); return -EINVAL; } /* Check whether already attached */ list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) { if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) { - pr_err("Tried to attach %s but duplicate UUID already attached\n", - dc->backing_dev_name); + pr_err("Tried to attach %pg but duplicate UUID already attached\n", + dc->bdev); return -EINVAL; } @@ -1243,15 +1241,13 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, if (!u) { if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - pr_err("Couldn't find uuid for %s in set\n", - dc->backing_dev_name); + pr_err("Couldn't find uuid for %pg in set\n", dc->bdev); return -ENOENT; } u = uuid_find_empty(c); if (!u) { - pr_err("Not caching %s, no room for UUID\n", - dc->backing_dev_name); + pr_err("Not caching %pg, no room for UUID\n", dc->bdev); return 
-EINVAL; } } @@ -1319,8 +1315,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, */ kthread_stop(dc->writeback_thread); cancel_writeback_rate_update_dwork(dc); - pr_err("Couldn't run cached device %s\n", - dc->backing_dev_name); + pr_err("Couldn't run cached device %pg\n", dc->bdev); return ret; } @@ -1336,8 +1331,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, /* Allow the writeback thread to proceed */ up_write(&dc->writeback_lock); - pr_info("Caching %s as %s on set %pU\n", - dc->backing_dev_name, + pr_info("Caching %pg as %s on set %pU\n", + dc->bdev, dc->disk.disk->disk_name, dc->disk.c->set_uuid); return 0; @@ -1461,7 +1456,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct cache_set *c; int ret = -ENOMEM; - bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; dc->bdev->bd_holder = dc; @@ -1476,7 +1470,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; - pr_info("registered backing device %s\n", dc->backing_dev_name); + pr_info("registered backing device %pg\n", dc->bdev); list_add(&dc->list, &uncached_devices); /* attach to a matched cache set if it exists */ @@ -1493,7 +1487,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, return 0; err: - pr_notice("error %s: %s\n", dc->backing_dev_name, err); + pr_notice("error %pg: %s\n", dc->bdev, err); bcache_device_stop(&dc->disk); return ret; } @@ -1534,10 +1528,11 @@ static void flash_dev_flush(struct closure *cl) static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) { + int err = -ENOMEM; struct bcache_device *d = kzalloc(sizeof(struct bcache_device), GFP_KERNEL); if (!d) - return -ENOMEM; + goto err_ret; closure_init(&d->cl, NULL); set_closure_fn(&d->cl, flash_dev_flush, system_wq); @@ -1551,9 +1546,12 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) bcache_device_attach(d, c, u - c->uuids); bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); - add_disk(d->disk); + err = add_disk(d->disk); + if (err) + goto err; - if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) + err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"); + if (err) goto err; bcache_device_link(d, c, "volume"); @@ -1567,7 +1565,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) return 0; err: kobject_put(&d->kobj); - return -ENOMEM; +err_ret: + return err; } static int flash_devs_run(struct cache_set *c) @@ -1621,8 +1620,8 @@ bool bch_cached_dev_error(struct cached_dev *dc) /* make others know io_disable is true earlier */ smp_mb(); - pr_err("stop %s: too many IO errors on backing device %s\n", - dc->disk.disk->disk_name, dc->backing_dev_name); + pr_err("stop %s: too many IO errors on backing device %pg\n", + dc->disk.disk->disk_name, dc->bdev); bcache_device_stop(&dc->disk); return true; @@ -2338,7 +2337,7 @@ err_btree_alloc: err_free: module_put(THIS_MODULE); if (err) - pr_notice("error %s: %s\n", ca->cache_dev_name, err); + pr_notice("error %pg: %s\n", ca->bdev, err); return ret; } @@ -2348,7 +2347,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, const char *err = NULL; /* must be set for any error case */ int ret = 0; - bdevname(bdev, ca->cache_dev_name); memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; @@ -2390,14 +2388,14 @@ 
static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto out; } - pr_info("registered cache device %s\n", ca->cache_dev_name); + pr_info("registered cache device %pg\n", ca->bdev); out: kobject_put(&ca->kobj); err: if (err) - pr_notice("error %s: %s\n", ca->cache_dev_name, err); + pr_notice("error %pg: %s\n", ca->bdev, err); return ret; } @@ -2617,8 +2615,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) + if (!dc) { + ret = -ENOMEM; + err = "cannot allocate memory"; goto out_put_sb_page; + } mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_disk, bdev, dc); @@ -2629,11 +2630,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) + if (!ca) { + ret = -ENOMEM; + err = "cannot allocate memory"; goto out_put_sb_page; + } /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_disk, bdev, ca) != 0) + ret = register_cache(sb, sb_disk, bdev, ca); + if (ret) goto out_free_sb; } @@ -2750,7 +2755,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) * The reason bch_register_lock is not held to call * bch_cache_set_stop() and bcache_device_stop() is to * avoid potential deadlock during reboot, because cache - * set or bcache device stopping process will acqurie + * set or bcache device stopping process will acquire * bch_register_lock too. * * We are safe here because bcache_is_reboot sets to diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 05ac1d6fbbf3..1f0dce30fa75 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -271,7 +271,7 @@ SHOW(__bch_cached_dev) } if (attr == &sysfs_backing_dev_name) { - snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name); + snprintf(buf, BDEVNAME_SIZE + 1, "%pg", dc->bdev); strcat(buf, "\n"); return strlen(buf); } diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 215df32f567b..c1752ba2e05b 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -51,13 +51,27 @@ STORE(fn) \ #define sysfs_printf(file, fmt, ...) \ do { \ if (attr == &sysfs_ ## file) \ - return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ + return sysfs_emit(buf, fmt "\n", __VA_ARGS__); \ } while (0) #define sysfs_print(file, var) \ do { \ if (attr == &sysfs_ ## file) \ - return snprint(buf, PAGE_SIZE, var); \ + return sysfs_emit(buf, \ + __builtin_types_compatible_p(typeof(var), int) \ + ? "%i\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned int) \ + ? "%u\n" : \ + __builtin_types_compatible_p(typeof(var), long) \ + ? "%li\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned long)\ + ? "%lu\n" : \ + __builtin_types_compatible_p(typeof(var), int64_t) \ + ? "%lli\n" : \ + __builtin_types_compatible_p(typeof(var), uint64_t) \ + ? "%llu\n" : \ + __builtin_types_compatible_p(typeof(var), const char *) \ + ? "%s\n" : "%i\n", var); \ } while (0) #define sysfs_hprint(file, val) \ diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index b64460a76267..6f3cb7c92130 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -340,23 +340,6 @@ static inline int bch_strtoul_h(const char *cp, long *res) _r; \ }) -#define snprint(buf, size, var) \ - snprintf(buf, size, \ - __builtin_types_compatible_p(typeof(var), int) \ - ? 
"%i\n" : \ - __builtin_types_compatible_p(typeof(var), unsigned int) \ - ? "%u\n" : \ - __builtin_types_compatible_p(typeof(var), long) \ - ? "%li\n" : \ - __builtin_types_compatible_p(typeof(var), unsigned long)\ - ? "%lu\n" : \ - __builtin_types_compatible_p(typeof(var), int64_t) \ - ? "%lli\n" : \ - __builtin_types_compatible_p(typeof(var), uint64_t) \ - ? "%llu\n" : \ - __builtin_types_compatible_p(typeof(var), const char *) \ - ? "%s\n" : "%i\n", var) - ssize_t bch_hprint(char *buf, int64_t v); bool bch_is_zero(const char *p, size_t n); @@ -548,14 +531,6 @@ static inline uint64_t bch_crc64(const void *p, size_t len) return crc ^ 0xffffffffffffffffULL; } -static inline uint64_t bch_crc64_update(uint64_t crc, - const void *p, - size_t len) -{ - crc = crc64_be(crc, p, len); - return crc; -} - /* * A stepwise-linear pseudo-exponential. This returns 1 << (x >> * frac_bits), with the less-significant bits filled in by linear @@ -584,8 +559,4 @@ static inline unsigned int fract_exp_two(unsigned int x, void bch_bio_map(struct bio *bio, void *base); int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); -static inline sector_t bdev_sectors(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> 9; -} #endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8120da278161..c7560f66dca8 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -45,7 +45,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * backing volume uses about 2% of the cache for dirty data. */ uint32_t bdev_share = - div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, c->cached_dev_sectors); uint64_t cache_dirty_target = diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index a3b71350eec8..745e3ab4aa0a 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -8,6 +8,7 @@ #define DM_BIO_RECORD_H #include <linux/bio.h> +#include <linux/blk-integrity.h> /* * There are lots of mutable fields in the bio struct that get diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 50f3e673729c..104ebc1f08dc 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1525,7 +1525,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) { - sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT; + sector_t s = bdev_nr_sectors(c->bdev); if (s >= c->start) s -= c->start; else diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 89a73204dbf4..2874f222c313 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -334,7 +334,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) int r; struct dm_block *sblock; struct cache_disk_superblock *disk_super; - sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(cmd->bdev); /* FIXME: see if we can lose the max sectors limit */ if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index bdd500447dea..447d030036d1 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1940,7 +1940,7 @@ static void cache_dtr(struct dm_target *ti) static sector_t get_dev_size(struct dm_dev *dev) { - return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; + return bdev_nr_sectors(dev->bdev); } 
/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 84dbe08ad205..4599632d7a84 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -161,7 +161,7 @@ static const char *clone_device_name(struct clone *clone) static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) { - const char *descs[] = { + static const char * const descs[] = { "read-write", "read-only", "fail" @@ -1514,7 +1514,7 @@ error: static sector_t get_dev_size(struct dm_dev *dev) { - return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; + return bdev_nr_sectors(dev->bdev); } /*---------------------------------------------------------------------------*/ diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 55dccdfbcb22..b855fef4f38a 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,7 +13,7 @@ #include <linux/ktime.h> #include <linux/genhd.h> #include <linux/blk-mq.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #include <trace/events/block.h> @@ -200,7 +200,7 @@ struct dm_table { struct dm_md_mempools *mempools; #ifdef CONFIG_BLK_INLINE_ENCRYPTION - struct blk_keyslot_manager *ksm; + struct blk_crypto_profile *crypto_profile; #endif }; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 916b7da16de2..292f7896f733 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -15,6 +15,7 @@ #include <linux/key.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/crypto.h> diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 3163e2b1418e..03672204b0e3 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -415,7 +415,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, char *result, unsigned int maxlen) { struct dust_device *dd = ti->private; - sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; + sector_t size = bdev_nr_sectors(dd->dev->bdev); bool invalid_msg = false; int r = -EINVAL; unsigned long long tmp, block; @@ -544,8 +544,7 @@ static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) /* * Only pass ioctls through if the device sizes match exactly. */ - if (dd->start || - ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + if (dd->start || ti->len != bdev_nr_sectors(dev->bdev)) return 1; return 0; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index d25989660a76..7ce5d509b940 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -416,7 +416,7 @@ static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) * Only pass ioctls through if the device sizes match exactly. 
*/ *bdev = dev->bdev; - return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT); + return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev)); } static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 2a78f6874143..1f6bf152b3c7 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1681,7 +1681,7 @@ static int era_message(struct dm_target *ti, unsigned argc, char **argv, static sector_t get_dev_size(struct dm_dev *dev) { - return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; + return bdev_nr_sectors(dev->bdev); } static int era_iterate_devices(struct dm_target *ti, diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 3f4139ac1f60..b5f20eba3641 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -168,7 +168,7 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) */ static inline sector_t get_dev_size(struct block_device *bdev) { - return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + return bdev_nr_sectors(bdev); } static inline chunk_t sector_to_chunk(struct dm_exception_store *store, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 4b94ffe6f2d4..345229d7e59c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -456,8 +456,7 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev /* * Only pass ioctls through if the device sizes match exactly. */ - if (fc->start || - ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) + if (fc->start || ti->len != bdev_nr_sectors((*bdev))) return 1; return 0; } diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 2c5edfbd7711..957999998d70 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -12,6 +12,7 @@ #include "dm-ima.h" #include <linux/ima.h> +#include <linux/sched/mm.h> #include <crypto/hash.h> #include <linux/crypto.h> #include <crypto/hash_info.h> diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index dc03b70f6e65..d0f788e72abf 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4113,11 +4113,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; + ic->data_device_sectors = bdev_nr_sectors(ic->dev->bdev); if (!ic->meta_dev) ic->meta_device_sectors = ic->data_device_sectors; else - ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT; + ic->meta_device_sectors = bdev_nr_sectors(ic->meta_dev->bdev); if (!journal_sectors) { journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, @@ -4367,7 +4367,7 @@ try_smaller_buffer: DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); DEBUG_print(" journal_entries %u\n", ic->journal_entries); DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); - DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT); + DEBUG_print(" data_device_sectors 0x%llx\n", bdev_nr_sectors(ic->dev->bdev)); DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 679b4c0a2eea..66ba16713f69 100644 --- a/drivers/md/dm-linear.c +++ 
b/drivers/md/dm-linear.c @@ -135,8 +135,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev /* * Only pass ioctls through if the device sizes match exactly. */ - if (lc->start || - ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + if (lc->start || ti->len != bdev_nr_sectors(dev->bdev)) return 1; return 0; } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index d93a4db23512..46de085a9670 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -446,7 +446,7 @@ static int log_super(struct log_writes_c *lc) static inline sector_t logdev_last_sector(struct log_writes_c *lc) { - return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT; + return bdev_nr_sectors(lc->logdev->bdev); } static int log_writes_kthread(void *arg) @@ -851,7 +851,7 @@ static int log_writes_prepare_ioctl(struct dm_target *ti, /* * Only pass ioctls through if the device sizes match exactly. */ - if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + if (ti->len != bdev_nr_sectors(dev->bdev)) return 1; return 0; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 1ecf75ef276a..06f328928a7f 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -447,7 +447,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, bdev_logical_block_size(lc->header_location. bdev)); - if (buf_size > i_size_read(dev->bdev->bd_inode)) { + if (buf_size > bdev_nr_bytes(dev->bdev)) { DMWARN("log device %s too small: need %llu bytes", dev->name, (unsigned long long)buf_size); kfree(lc); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 694aaca4eea2..90dc9cc48881 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -530,7 +530,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, bdev = pgpath->path.dev->bdev; q = bdev_get_queue(bdev); - clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, + clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE, BLK_MQ_REQ_NOWAIT); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ @@ -579,7 +579,7 @@ static void multipath_release_clone(struct request *clone, clone->io_start_time_ns); } - blk_put_request(clone); + blk_mq_free_request(clone); } /* @@ -2061,7 +2061,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* * Only pass ioctls through if the device sizes match exactly. 
*/ - if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) + if (!r && ti->len != bdev_nr_sectors((*bdev))) return 1; return r; } diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index 1856a1b125cc..875bca30a0dd 100644 --- a/drivers/md/dm-ps-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c @@ -27,6 +27,7 @@ #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/sched/clock.h> #define DM_MSG_PREFIX "multipath historical-service-time" diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index d9ef52159a22..2b26435a6946 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1261,7 +1261,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, md_rdev_init(jdev); jdev->mddev = &rs->md; jdev->bdev = rs->journal_dev.dev->bdev; - jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode)); + jdev->sectors = bdev_nr_sectors(jdev->bdev); if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) { rs->ti->error = "No space for raid4/5/6 journal"; return -ENOSPC; @@ -1607,7 +1607,7 @@ static int _check_data_dev_sectors(struct raid_set *rs) rdev_for_each(rdev, &rs->md) if (!test_bit(Journal, &rdev->flags) && rdev->bdev) { - ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode))); + ds = min(ds, bdev_nr_sectors(rdev->bdev)); if (ds < rs->md.dev_sectors) { rs->ti->error = "Component device(s) too small"; return -EINVAL; @@ -2662,7 +2662,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs) * Make sure we got a minimum amount of free sectors per device */ if (rs->data_offset && - to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { + bdev_nr_sectors(rdev->bdev) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { rs->ti->error = data_offset ? "No space for forward reshape" : "No space for backward reshape"; return -ENOSPC; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 5b95eea517d1..579ab6183d4d 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -7,7 +7,6 @@ #include "dm-core.h" #include "dm-rq.h" -#include <linux/elevator.h> /* for rq_end_sector() */ #include <linux/blk-mq.h> #define DM_MSG_PREFIX "core-rq" @@ -490,6 +489,14 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct mapped_device *md = tio->md; struct dm_target *ti = md->immutable_target; + /* + * blk-mq's unquiesce may come from outside events, such as + * elevator switch, updating nr_requests or others, and request may + * come during suspend, so simply ask for blk-mq to requeue it. + */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) + return BLK_STS_RESOURCE; + if (unlikely(!ti)) { int srcu_idx; struct dm_table *map = dm_get_live_table(md, &srcu_idx); diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 028a92ff6d57..534dc2ca8bb0 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -529,7 +529,7 @@ static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev * Only pass ioctls through if the device sizes match exactly. 
*/ if (ti->len + sctx->path_list[path_nr].start != - i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) + bdev_nr_sectors((*bdev))) return 1; return 0; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2111daaacaba..bcddc5effd15 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/string.h> @@ -169,7 +170,7 @@ static void free_devices(struct list_head *devices, struct mapped_device *md) } } -static void dm_table_destroy_keyslot_manager(struct dm_table *t); +static void dm_table_destroy_crypto_profile(struct dm_table *t); void dm_table_destroy(struct dm_table *t) { @@ -199,7 +200,7 @@ void dm_table_destroy(struct dm_table *t) dm_free_md_mempools(t->mempools); - dm_table_destroy_keyslot_manager(t); + dm_table_destroy_crypto_profile(t); kfree(t); } @@ -226,8 +227,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, { struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; - sector_t dev_size = - i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + sector_t dev_size = bdev_nr_sectors(bdev); unsigned short logical_block_size_sectors = limits->logical_block_size >> SECTOR_SHIFT; char b[BDEVNAME_SIZE]; @@ -1186,8 +1186,8 @@ static int dm_table_register_integrity(struct dm_table *t) #ifdef CONFIG_BLK_INLINE_ENCRYPTION -struct dm_keyslot_manager { - struct blk_keyslot_manager ksm; +struct dm_crypto_profile { + struct blk_crypto_profile profile; struct mapped_device *md; }; @@ -1213,13 +1213,11 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, * When an inline encryption key is evicted from a device-mapper device, evict * it from all the underlying devices. 
*/ -static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, +static int dm_keyslot_evict(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct dm_keyslot_manager *dksm = container_of(ksm, - struct dm_keyslot_manager, - ksm); - struct mapped_device *md = dksm->md; + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; struct dm_keyslot_evict_args args = { key }; struct dm_table *t; int srcu_idx; @@ -1239,150 +1237,148 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, return args.err; } -static const struct blk_ksm_ll_ops dm_ksm_ll_ops = { - .keyslot_evict = dm_keyslot_evict, -}; - -static int device_intersect_crypto_modes(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) +static int +device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - struct blk_keyslot_manager *parent = data; - struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm; + struct blk_crypto_profile *parent = data; + struct blk_crypto_profile *child = + bdev_get_queue(dev->bdev)->crypto_profile; - blk_ksm_intersect_modes(parent, child); + blk_crypto_intersect_capabilities(parent, child); return 0; } -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) { - struct dm_keyslot_manager *dksm = container_of(ksm, - struct dm_keyslot_manager, - ksm); + struct dm_crypto_profile *dmcp = container_of(profile, + struct dm_crypto_profile, + profile); - if (!ksm) + if (!profile) return; - blk_ksm_destroy(ksm); - kfree(dksm); + blk_crypto_profile_destroy(profile); + kfree(dmcp); } -static void dm_table_destroy_keyslot_manager(struct dm_table *t) +static void dm_table_destroy_crypto_profile(struct dm_table *t) { - dm_destroy_keyslot_manager(t->ksm); - t->ksm = NULL; + dm_destroy_crypto_profile(t->crypto_profile); + t->crypto_profile = NULL; } /* - * Constructs and initializes t->ksm with a keyslot manager that - * represents the common set of crypto capabilities of the devices - * described by the dm_table. However, if the constructed keyslot - * manager does not support a superset of the crypto capabilities - * supported by the current keyslot manager of the mapped_device, - * it returns an error instead, since we don't support restricting - * crypto capabilities on table changes. Finally, if the constructed - * keyslot manager doesn't actually support any crypto modes at all, - * it just returns NULL. + * Constructs and initializes t->crypto_profile with a crypto profile that + * represents the common set of crypto capabilities of the devices described by + * the dm_table. However, if the constructed crypto profile doesn't support all + * crypto capabilities that are supported by the current mapped_device, it + * returns an error instead, since we don't support removing crypto capabilities + * on table changes. Finally, if the constructed crypto profile is "empty" (has + * no crypto capabilities at all), it just sets t->crypto_profile to NULL. 
*/ -static int dm_table_construct_keyslot_manager(struct dm_table *t) +static int dm_table_construct_crypto_profile(struct dm_table *t) { - struct dm_keyslot_manager *dksm; - struct blk_keyslot_manager *ksm; + struct dm_crypto_profile *dmcp; + struct blk_crypto_profile *profile; struct dm_target *ti; unsigned int i; - bool ksm_is_empty = true; + bool empty_profile = true; - dksm = kmalloc(sizeof(*dksm), GFP_KERNEL); - if (!dksm) + dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL); + if (!dmcp) return -ENOMEM; - dksm->md = t->md; + dmcp->md = t->md; - ksm = &dksm->ksm; - blk_ksm_init_passthrough(ksm); - ksm->ksm_ll_ops = dm_ksm_ll_ops; - ksm->max_dun_bytes_supported = UINT_MAX; - memset(ksm->crypto_modes_supported, 0xFF, - sizeof(ksm->crypto_modes_supported)); + profile = &dmcp->profile; + blk_crypto_profile_init(profile, 0); + profile->ll_ops.keyslot_evict = dm_keyslot_evict; + profile->max_dun_bytes_supported = UINT_MAX; + memset(profile->modes_supported, 0xFF, + sizeof(profile->modes_supported)); for (i = 0; i < dm_table_get_num_targets(t); i++) { ti = dm_table_get_target(t, i); if (!dm_target_passes_crypto(ti->type)) { - blk_ksm_intersect_modes(ksm, NULL); + blk_crypto_intersect_capabilities(profile, NULL); break; } if (!ti->type->iterate_devices) continue; - ti->type->iterate_devices(ti, device_intersect_crypto_modes, - ksm); + ti->type->iterate_devices(ti, + device_intersect_crypto_capabilities, + profile); } - if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) { + if (t->md->queue && + !blk_crypto_has_capabilities(profile, + t->md->queue->crypto_profile)) { DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); - dm_destroy_keyslot_manager(ksm); + dm_destroy_crypto_profile(profile); return -EINVAL; } /* - * If the new KSM doesn't actually support any crypto modes, we may as - * well represent it with a NULL ksm. + * If the new profile doesn't actually support any crypto capabilities, + * we may as well represent it with a NULL profile. */ - ksm_is_empty = true; - for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) { - if (ksm->crypto_modes_supported[i]) { - ksm_is_empty = false; + for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) { + if (profile->modes_supported[i]) { + empty_profile = false; break; } } - if (ksm_is_empty) { - dm_destroy_keyslot_manager(ksm); - ksm = NULL; + if (empty_profile) { + dm_destroy_crypto_profile(profile); + profile = NULL; } /* - * t->ksm is only set temporarily while the table is being set - * up, and it gets set to NULL after the capabilities have - * been transferred to the request_queue. + * t->crypto_profile is only set temporarily while the table is being + * set up, and it gets set to NULL after the profile has been + * transferred to the request_queue. */ - t->ksm = ksm; + t->crypto_profile = profile; return 0; } -static void dm_update_keyslot_manager(struct request_queue *q, - struct dm_table *t) +static void dm_update_crypto_profile(struct request_queue *q, + struct dm_table *t) { - if (!t->ksm) + if (!t->crypto_profile) return; - /* Make the ksm less restrictive */ - if (!q->ksm) { - blk_ksm_register(t->ksm, q); + /* Make the crypto profile less restrictive. 
*/ + if (!q->crypto_profile) { + blk_crypto_register(t->crypto_profile, q); } else { - blk_ksm_update_capabilities(q->ksm, t->ksm); - dm_destroy_keyslot_manager(t->ksm); + blk_crypto_update_capabilities(q->crypto_profile, + t->crypto_profile); + dm_destroy_crypto_profile(t->crypto_profile); } - t->ksm = NULL; + t->crypto_profile = NULL; } #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static int dm_table_construct_keyslot_manager(struct dm_table *t) +static int dm_table_construct_crypto_profile(struct dm_table *t) { return 0; } -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) { } -static void dm_table_destroy_keyslot_manager(struct dm_table *t) +static void dm_table_destroy_crypto_profile(struct dm_table *t) { } -static void dm_update_keyslot_manager(struct request_queue *q, - struct dm_table *t) +static void dm_update_crypto_profile(struct request_queue *q, + struct dm_table *t) { } @@ -1414,9 +1410,9 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_construct_keyslot_manager(t); + r = dm_table_construct_crypto_profile(t); if (r) { - DMERR("could not construct keyslot manager."); + DMERR("could not construct crypto profile."); return r; } @@ -2070,7 +2066,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, return r; } - dm_update_keyslot_manager(q, t); + dm_update_crypto_profile(q, t); disk_update_readahead(t->md->disk); return 0; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index c88ed14d49e6..1a96a07cbf44 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -549,7 +549,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) int r; struct dm_block *sblock; struct thin_disk_superblock *disk_super; - sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(pmd->bdev); if (bdev_size > THIN_METADATA_MAX_SECTORS) bdev_size = THIN_METADATA_MAX_SECTORS; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4c67b77c23c1..ec119d2422d5 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3212,7 +3212,7 @@ static int metadata_pre_commit_callback(void *context) static sector_t get_dev_size(struct block_device *bdev) { - return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + return bdev_nr_sectors(bdev); } static void warn_if_metadata_device_too_big(struct block_device *bdev) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 22a5ac82446a..a7efe83aad29 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -18,6 +18,7 @@ #include "dm-verity-verify-sig.h" #include <linux/module.h> #include <linux/reboot.h> +#include <linux/scatterlist.h> #define DM_MSG_PREFIX "verity" @@ -475,6 +476,7 @@ static int verity_verify_io(struct dm_verity_io *io) struct bvec_iter start; unsigned b; struct crypto_wait wait; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); for (b = 0; b < io->n_blocks; b++) { int r; @@ -529,9 +531,17 @@ static int verity_verify_io(struct dm_verity_io *io) else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, cur_block, NULL, &start) == 0) continue; - else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - cur_block)) - return -EIO; + else { + if (bio->bi_status) { + /* + * Error correction failed; Just return error + */ + return -EIO; + } + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + cur_block)) + return -EIO; + } } 
return 0; @@ -824,8 +834,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev *bdev = v->data_dev->bdev; - if (v->data_start || - ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) + if (v->data_start || ti->len != bdev_nr_sectors(v->data_dev->bdev)) return 1; return 0; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 18320444fb0a..017806096b91 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2341,7 +2341,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Cache data device lookup failed"; goto bad; } - wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode); + wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev); /* * Parse the cache block size diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ae1bc48c0043..8dc21c09329f 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -733,7 +733,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path, dev->dev_idx = idx; (void)bdevname(dev->bdev, dev->name); - dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + dev->capacity = bdev_nr_sectors(bdev); if (ti->begin) { ti->error = "Partial mapping is not supported"; goto err; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a011d09cb0fa..63aa52263658 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -29,7 +29,7 @@ #include <linux/refcount.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #define DM_MSG_PREFIX "core" @@ -496,18 +496,17 @@ static void start_io_acct(struct dm_io *io) false, 0, &io->stats_aux); } -static void end_io_acct(struct dm_io *io) +static void end_io_acct(struct mapped_device *md, struct bio *bio, + unsigned long start_time, struct dm_stats_aux *stats_aux) { - struct mapped_device *md = io->md; - struct bio *bio = io->orig_bio; - unsigned long duration = jiffies - io->start_time; + unsigned long duration = jiffies - start_time; - bio_end_io_acct(bio, io->start_time); + bio_end_io_acct(bio, start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), - true, duration, &io->stats_aux); + true, duration, stats_aux); /* nudge anyone waiting on suspend queue */ if (unlikely(wq_has_sleeper(&md->wait))) @@ -790,6 +789,8 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error) blk_status_t io_error; struct bio *bio; struct mapped_device *md = io->md; + unsigned long start_time = 0; + struct dm_stats_aux stats_aux; /* Push-back supersedes any I/O errors */ if (unlikely(error)) { @@ -821,8 +822,10 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error) } io_error = io->status; - end_io_acct(io); + start_time = io->start_time; + stats_aux = io->stats_aux; free_io(md, io); + end_io_acct(md, bio, start_time, &stats_aux); if (io_error == BLK_STS_DM_REQUEUE) return; @@ -1180,14 +1183,13 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) mutex_unlock(&md->swap_bios_lock); } -static blk_qc_t __map_bio(struct dm_target_io *tio) +static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; struct bio *clone = &tio->clone; struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; - blk_qc_t ret = BLK_QC_T_NONE; clone->bi_end_io = clone_endio; @@ -1223,7 +1225,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) case 
DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); - ret = submit_bio_noacct(clone); + submit_bio_noacct(clone); break; case DM_MAPIO_KILL: if (unlikely(swap_bios_limit(ti, clone))) { @@ -1245,8 +1247,6 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) DMWARN("unimplemented target map return value: %d", r); BUG(); } - - return ret; } static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) @@ -1333,7 +1333,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci, +static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target_io *tio, unsigned *len) { struct bio *clone = &tio->clone; @@ -1343,8 +1343,7 @@ static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci, __bio_clone_fast(clone, ci->bio); if (len) bio_setup_sector(clone, ci->sector, *len); - - return __map_bio(tio); + __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, @@ -1358,7 +1357,7 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, while ((bio = bio_list_pop(&blist))) { tio = container_of(bio, struct dm_target_io, clone); - (void) __clone_and_map_simple_bio(ci, tio, len); + __clone_and_map_simple_bio(ci, tio, len); } } @@ -1402,7 +1401,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, free_tio(tio); return r; } - (void) __map_bio(tio); + __map_bio(tio); return 0; } @@ -1517,11 +1516,10 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md, /* * Entry point to split a bio into clones and submit them to the targets. */ -static blk_qc_t __split_and_process_bio(struct mapped_device *md, +static void __split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { struct clone_info ci; - blk_qc_t ret = BLK_QC_T_NONE; int error = 0; init_clone_info(&ci, md, map, bio); @@ -1564,19 +1562,17 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, bio_chain(b, bio); trace_block_split(b, bio->bi_iter.bi_sector); - ret = submit_bio_noacct(bio); + submit_bio_noacct(bio); } } /* drop the extra reference count */ dm_io_dec_pending(ci.io, errno_to_blk_status(error)); - return ret; } -static blk_qc_t dm_submit_bio(struct bio *bio) +static void dm_submit_bio(struct bio *bio) { struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; - blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; @@ -1606,10 +1602,9 @@ static blk_qc_t dm_submit_bio(struct bio *bio) if (is_abnormal_io(bio)) blk_queue_split(&bio); - ret = __split_and_process_bio(md, map, bio); + __split_and_process_bio(md, map, bio); out: dm_put_live_table(md, srcu_idx); - return ret; } /*----------------------------------------------------------------- @@ -1668,14 +1663,14 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); #ifdef CONFIG_BLK_INLINE_ENCRYPTION -static void dm_queue_destroy_keyslot_manager(struct request_queue *q) +static void dm_queue_destroy_crypto_profile(struct request_queue *q) { - dm_destroy_keyslot_manager(q->ksm); + dm_destroy_crypto_profile(q->crypto_profile); } #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q) +static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) { } #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ @@ 
-1701,7 +1696,7 @@ static void cleanup_mapped_device(struct mapped_device *md) dm_sysfs_exit(md); del_gendisk(md->disk); } - dm_queue_destroy_keyslot_manager(md->queue); + dm_queue_destroy_crypto_profile(md->queue); blk_cleanup_disk(md->disk); } @@ -2083,7 +2078,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) if (r) return r; - add_disk(md->disk); + r = add_disk(md->disk); + if (r) + return r; r = dm_sysfs_init(md); if (r) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 6c0c3d0d905a..5111ed966947 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -41,6 +41,7 @@ #include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> @@ -51,6 +52,7 @@ #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> +#include <linux/major.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/file.h> @@ -352,7 +354,7 @@ static bool create_on_open = true; */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -void md_new_event(struct mddev *mddev) +void md_new_event(void) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -441,19 +443,19 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); -static blk_qc_t md_submit_bio(struct bio *bio) +static void md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); - return BLK_QC_T_NONE; + return; } if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); - return BLK_QC_T_NONE; + return; } blk_queue_split(&bio); @@ -462,15 +464,13 @@ static blk_qc_t md_submit_bio(struct bio *bio) if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - return BLK_QC_T_NONE; + return; } /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); - - return BLK_QC_T_NONE; } /* mddev_suspend makes sure no new requests are submitted @@ -888,8 +888,7 @@ static struct md_personality *find_pers(int level, char *clevel) /* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { - sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; - return MD_NEW_SIZE_SECTORS(num_sectors); + return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); } static int alloc_disk_sb(struct md_rdev *rdev) @@ -1631,8 +1630,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ */ switch(minor_version) { case 0: - sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; - sb_start -= 8*2; + sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; sb_start &= ~(sector_t)(4*2-1); break; case 1: @@ -1787,10 +1785,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ else ret = 0; } - if (minor_version) { - sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); - sectors -= rdev->data_offset; - } else + if (minor_version) + sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; + else sectors = rdev->sb_start; if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; @@ -2168,8 +2165,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ - max_sectors = 
i_size_read(rdev->bdev->bd_inode) >> 9; - max_sectors -= rdev->data_offset; + max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; } else if (rdev->mddev->bitmap_info.offset) { @@ -2178,7 +2174,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } else { /* minor version 0; superblock after data */ sector_t sb_start, bm_space; - sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9; + sector_t dev_size = bdev_nr_sectors(rdev->bdev); /* 8K is for superblock */ sb_start = dev_size - 8*2; @@ -2886,7 +2882,7 @@ static int add_bound_rdev(struct md_rdev *rdev) if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_new_event(mddev); + md_new_event(); md_wakeup_thread(mddev->thread); return 0; } @@ -2976,7 +2972,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * -write_error - clears WriteErrorSeen * {,-}failfast - set/clear FailFast */ + + struct mddev *mddev = rdev->mddev; int err = -EINVAL; + bool need_update_sb = false; + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); if (test_bit(Faulty, &rdev->flags)) @@ -2991,7 +2991,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->raid_disk >= 0) err = -EBUSY; else { - struct mddev *mddev = rdev->mddev; err = 0; if (mddev_is_clustered(mddev)) err = md_cluster_ops->remove_disk(mddev, rdev); @@ -3002,16 +3001,18 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); } - md_new_event(mddev); + md_new_event(); } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); mddev_create_serial_pool(rdev->mddev, rdev, false); + need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { mddev_destroy_serial_pool(rdev->mddev, rdev, false); clear_bit(WriteMostly, &rdev->flags); + need_update_sb = true; err = 0; } else if (cmd_match(buf, "blocked")) { set_bit(Blocked, &rdev->flags); @@ -3037,9 +3038,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = 0; } else if (cmd_match(buf, "failfast")) { set_bit(FailFast, &rdev->flags); + need_update_sb = true; err = 0; } else if (cmd_match(buf, "-failfast")) { clear_bit(FailFast, &rdev->flags); + need_update_sb = true; err = 0; } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags)) { @@ -3118,6 +3121,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(ExternalBbl, &rdev->flags); err = 0; } + if (need_update_sb) + md_update_sb(mddev, 1); if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? 
err : len; @@ -3382,7 +3387,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) if (!sectors) return -EBUSY; } else if (!sectors) - sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - + sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!my_mddev->pers->resize) /* Cannot change size for RAID0 or Linear etc */ @@ -3709,7 +3714,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe kobject_init(&rdev->kobj, &rdev_ktype); - size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; + size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) { pr_warn("md: %s has zero or unknown size, marking faulty!\n", bdevname(rdev->bdev,b)); @@ -4099,7 +4104,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); - md_new_event(mddev); + md_new_event(); rv = len; out_unlock: mddev_unlock(mddev); @@ -4620,7 +4625,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) export_rdev(rdev); mddev_unlock(mddev); if (!err) - md_new_event(mddev); + md_new_event(); return err ? err : len; } @@ -5490,6 +5495,10 @@ static struct attribute *md_default_attrs[] = { NULL, }; +static const struct attribute_group md_default_group = { + .attrs = md_default_attrs, +}; + static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_last_scan_mode.attr, @@ -5512,6 +5521,12 @@ static const struct attribute_group md_redundancy_group = { .attrs = md_redundancy_attrs, }; +static const struct attribute_group *md_attr_groups[] = { + &md_default_group, + &md_bitmap_group, + NULL, +}; + static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -5587,7 +5602,7 @@ static const struct sysfs_ops md_sysfs_ops = { static struct kobj_type md_ktype = { .release = md_free, .sysfs_ops = &md_sysfs_ops, - .default_attrs = md_default_attrs, + .default_groups = md_attr_groups, }; int mdp_major = 0; @@ -5596,7 +5611,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -5663,7 +5677,7 @@ static int md_alloc(dev_t dev, char *name) strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; - goto abort; + goto out_unlock_disks_mutex; } spin_unlock(&all_mddevs_lock); } @@ -5676,7 +5690,7 @@ static int md_alloc(dev_t dev, char *name) error = -ENOMEM; disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - goto abort; + goto out_unlock_disks_mutex; disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -5700,27 +5714,25 @@ static int md_alloc(dev_t dev, char *name) disk->flags |= GENHD_FL_EXT_DEVT; disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; - add_disk(disk); + error = add_disk(disk); + if (error) + goto out_cleanup_disk; error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); - if (error) { - /* This isn't possible, but as kobject_init_and_add is marked - * __must_check, we must do something with the result - */ - pr_debug("md: cannot register %s/md - name in use\n", - disk->disk_name); - error = 0; - } - if (mddev->kobj.sd && - sysfs_create_group(&mddev->kobj, &md_bitmap_group)) - pr_debug("pointless warning\n"); - abort: + if (error) + goto out_del_gendisk; + + kobject_uevent(&mddev->kobj, KOBJ_ADD); + mddev->sysfs_state = 
sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); + goto out_unlock_disks_mutex; + +out_del_gendisk: + del_gendisk(disk); +out_cleanup_disk: + blk_cleanup_disk(disk); +out_unlock_disks_mutex: mutex_unlock(&disks_mutex); - if (!error && mddev->kobj.sd) { - kobject_uevent(&mddev->kobj, KOBJ_ADD); - mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); - mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); - } mddev_put(mddev); return error; } @@ -6034,7 +6046,7 @@ int md_run(struct mddev *mddev) if (mddev->sb_flags) md_update_sb(mddev, 0); - md_new_event(mddev); + md_new_event(); return 0; bitmap_abort: @@ -6424,7 +6436,7 @@ static int do_md_stop(struct mddev *mddev, int mode, if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; } - md_new_event(mddev); + md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } @@ -6880,7 +6892,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (!mddev->persistent) { pr_debug("md: nonpersistent superblock ...\n"); - rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + rdev->sb_start = bdev_nr_sectors(rdev->bdev); } else rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; @@ -6928,7 +6940,7 @@ kick_rdev: md_wakeup_thread(mddev->thread); else md_update_sb(mddev, 1); - md_new_event(mddev); + md_new_event(); return 0; busy: @@ -6967,7 +6979,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) if (mddev->persistent) rdev->sb_start = calc_dev_sboffset(rdev); else - rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + rdev->sb_start = bdev_nr_sectors(rdev->bdev); rdev->sectors = rdev->sb_start; @@ -7001,7 +7013,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - md_new_event(mddev); + md_new_event(); return 0; abort_export: @@ -7975,7 +7987,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) md_wakeup_thread(mddev->thread); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); - md_new_event(mddev); + md_new_event(); } EXPORT_SYMBOL(md_error); @@ -8859,7 +8871,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); - md_new_event(mddev); + md_new_event(); update_time = jiffies; blk_start_plug(&plug); @@ -8930,7 +8942,7 @@ void md_do_sync(struct md_thread *thread) /* this is the earliest that rebuild will be * visible in /proc/mdstat */ - md_new_event(mddev); + md_new_event(); if (last_check + window > io_sectors || j == max_sectors) continue; @@ -9154,7 +9166,7 @@ static int remove_and_add_spares(struct mddev *mddev, sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; - md_new_event(mddev); + md_new_event(); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } @@ -9188,7 +9200,7 @@ static void md_start_sync(struct work_struct *ws) } else md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + md_new_event(); } /* @@ -9447,7 +9459,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); } diff --git 
a/drivers/md/md.h b/drivers/md/md.h index 4c96c36bd01a..53ea7a6961de 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -731,7 +731,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op); extern void md_do_sync(struct md_thread *thread); -extern void md_new_event(struct mddev *mddev); +extern void md_new_event(void); extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 19598bd38939..7dc8026cf6ee 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1496,7 +1496,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!r1_bio->bios[i]) continue; - if (first_clone) { + if (first_clone && test_bit(WriteMostly, &rdev->flags)) { /* do behind I/O ? * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly @@ -1529,13 +1529,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->bios[i] = mbio; - mbio->bi_iter.bi_sector = (r1_bio->sector + - conf->mirrors[i].rdev->data_offset); - bio_set_dev(mbio, conf->mirrors[i].rdev->bdev); + mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); + bio_set_dev(mbio, rdev->bdev); mbio->bi_end_io = raid1_end_write_request; mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); - if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && - !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && + if (test_bit(FailFast, &rdev->flags) && + !test_bit(WriteMostly, &rdev->flags) && conf->raid_disks - mddev->degraded > 1) mbio->bi_opf |= MD_FAILFAST; mbio->bi_private = r1_bio; @@ -1546,7 +1545,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void *)conf->mirrors[i].rdev; + mbio->bi_bdev = (void *)rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index aa2636582841..dde98f65bd04 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4647,7 +4647,7 @@ out: } conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); - md_new_event(mddev); + md_new_event(); return 0; abort: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 02ed53b20654..9c1a5877cf9f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7732,10 +7732,7 @@ static int raid5_run(struct mddev *mddev) * discard data disk but write parity disk */ stripe = stripe * PAGE_SIZE; - /* Round up to power of 2, as discard handling - * currently assumes that */ - while ((stripe-1) & stripe) - stripe = (stripe | (stripe-1)) + 1; + stripe = roundup_pow_of_two(stripe); mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; @@ -8282,7 +8279,7 @@ static int raid5_start_reshape(struct mddev *mddev) } conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); - md_new_event(mddev); + md_new_event(); return 0; } diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 157c924686e4..80321e03809a 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -565,7 +565,7 @@ config VIDEO_QCOM_VENUS depends on VIDEO_DEV && VIDEO_V4L2 && 
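In the raid5_run() hunk above the open-coded loop that rounds the stripe size up to a power of two is replaced by roundup_pow_of_two(). The sketch below compares the two forms in plain C; next_pow2() is a local stand-in for the kernel macro, built on the GCC/Clang __builtin_clzll().

/* Compare the removed bit trick with a roundup_pow_of_two()-style helper.
 * Userspace sketch; next_pow2() is a local stand-in for the kernel macro.
 */
#include <assert.h>
#include <stdio.h>

static unsigned long long old_round_up(unsigned long long stripe)
{
    /* the loop removed from raid5_run(): fill the low bits until only one
     * bit remains set */
    while ((stripe - 1) & stripe)
        stripe = (stripe | (stripe - 1)) + 1;
    return stripe;
}

static unsigned long long next_pow2(unsigned long long n)
{
    /* roundup_pow_of_two() equivalent for n >= 1 */
    if (n <= 1)
        return 1;
    return 1ULL << (64 - __builtin_clzll(n - 1));
}

int main(void)
{
    unsigned long long n;

    for (n = 1; n < (1ULL << 20); n++)
        assert(old_round_up(n) == next_pow2(n));

    printf("3 data disks * 4096 -> %llu\n", next_pow2(3 * 4096ULL)); /* 16384 */
    return 0;
}

Both forms leave exact powers of two untouched and round everything else up, e.g. 3 * 4096 becomes 16384.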
QCOM_SMEM depends on (ARCH_QCOM && IOMMU_DMA) || COMPILE_TEST select QCOM_MDT_LOADER if ARCH_QCOM - select QCOM_SCM if ARCH_QCOM + select QCOM_SCM select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV help diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 85ba901bc11b..0f5a49fc7c9e 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -224,6 +224,7 @@ config HI6421V600_IRQ tristate "HiSilicon Hi6421v600 IRQ and powerkey" depends on OF depends on SPMI + depends on HAS_IOMEM select MFD_CORE select REGMAP_SPMI help diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c index e5a4ed3701eb..a798fad5f03c 100644 --- a/drivers/misc/cb710/sgbuf2.c +++ b/drivers/misc/cb710/sgbuf2.c @@ -47,7 +47,7 @@ static inline bool needs_unaligned_copy(const void *ptr) #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS return false; #else - return ((ptr - NULL) & 3) != 0; + return ((uintptr_t)ptr & 3) != 0; #endif } diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 4d09b672ac3c..632325474233 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -366,6 +366,13 @@ static const struct of_device_id at25_of_match[] = { }; MODULE_DEVICE_TABLE(of, at25_of_match); +static const struct spi_device_id at25_spi_ids[] = { + { .name = "at25",}, + { .name = "fm25",}, + { } +}; +MODULE_DEVICE_TABLE(spi, at25_spi_ids); + static int at25_probe(struct spi_device *spi) { struct at25_data *at25 = NULL; @@ -491,6 +498,7 @@ static struct spi_driver at25_driver = { .dev_groups = sernum_groups, }, .probe = at25_probe, + .id_table = at25_spi_ids, }; module_spi_driver(at25_driver); diff --git a/drivers/misc/eeprom/eeprom_93xx46.c b/drivers/misc/eeprom/eeprom_93xx46.c index 29d8971ec558..1f15399e5cb4 100644 --- a/drivers/misc/eeprom/eeprom_93xx46.c +++ b/drivers/misc/eeprom/eeprom_93xx46.c @@ -406,6 +406,23 @@ static const struct of_device_id eeprom_93xx46_of_table[] = { }; MODULE_DEVICE_TABLE(of, eeprom_93xx46_of_table); +static const struct spi_device_id eeprom_93xx46_spi_ids[] = { + { .name = "eeprom-93xx46", + .driver_data = (kernel_ulong_t)&at93c46_data, }, + { .name = "at93c46", + .driver_data = (kernel_ulong_t)&at93c46_data, }, + { .name = "at93c46d", + .driver_data = (kernel_ulong_t)&atmel_at93c46d_data, }, + { .name = "at93c56", + .driver_data = (kernel_ulong_t)&at93c56_data, }, + { .name = "at93c66", + .driver_data = (kernel_ulong_t)&at93c66_data, }, + { .name = "93lc46b", + .driver_data = (kernel_ulong_t)µchip_93lc46b_data, }, + {} +}; +MODULE_DEVICE_TABLE(spi, eeprom_93xx46_spi_ids); + static int eeprom_93xx46_probe_dt(struct spi_device *spi) { const struct of_device_id *of_id = @@ -555,6 +572,7 @@ static struct spi_driver eeprom_93xx46_driver = { }, .probe = eeprom_93xx46_probe, .remove = eeprom_93xx46_remove, + .id_table = eeprom_93xx46_spi_ids, }; module_spi_driver(eeprom_93xx46_driver); diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index beda610e6b30..ad6ced454655 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -814,10 +814,12 @@ static int fastrpc_get_args(u32 kernel, struct fastrpc_invoke_ctx *ctx) rpra[i].pv = (u64) ctx->args[i].ptr; pages[i].addr = ctx->maps[i]->phys; + mmap_read_lock(current->mm); vma = find_vma(current->mm, ctx->args[i].ptr); if (vma) pages[i].addr += ctx->args[i].ptr - vma->vm_start; + mmap_read_unlock(current->mm); pg_start = (ctx->args[i].ptr & PAGE_MASK) >> PAGE_SHIFT; pg_end = ((ctx->args[i].ptr + len - 1) & PAGE_MASK) >> diff --git a/drivers/misc/gehc-achc.c b/drivers/misc/gehc-achc.c 
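The cb710 change above rewrites the alignment test from (ptr - NULL) & 3 to ((uintptr_t)ptr & 3). Subtracting NULL from a pointer is not valid standard C (pointer arithmetic is only defined within an object), whereas converting the pointer to uintptr_t and masking the low bits is well defined. A self-contained illustration, with needs_unaligned_copy() mirroring the driver helper only in spirit:

/* Checking pointer alignment by value, as in the cb710 fix above.
 * needs_unaligned_copy() mirrors the driver helper only in spirit.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool needs_unaligned_copy(const void *ptr)
{
    /* well defined: convert the pointer to an integer, test the low bits */
    return ((uintptr_t)ptr & 3) != 0;
}

int main(void)
{
    _Alignas(4) char buf[8];
    int i;

    for (i = 0; i < 4; i++)
        printf("buf+%d -> %s\n", i,
               needs_unaligned_copy(buf + i) ? "unaligned" : "aligned");
    return 0;
}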
index 02f33bc60c56..4c9c5394da6f 100644 --- a/drivers/misc/gehc-achc.c +++ b/drivers/misc/gehc-achc.c @@ -539,6 +539,7 @@ static int gehc_achc_probe(struct spi_device *spi) static const struct spi_device_id gehc_achc_id[] = { { "ge,achc", 0 }, + { "achc", 0 }, { } }; MODULE_DEVICE_TABLE(spi, gehc_achc_id); diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 91b57544f7c6..6dafff375f1c 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -2649,11 +2649,18 @@ put_ctx: free_seq_arr: kfree(cs_seq_arr); - /* update output args */ - memset(args, 0, sizeof(*args)); if (rc) return rc; + if (mcs_data.wait_status == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for Multi-CS\n"); + return -EINTR; + } + + /* update output args */ + memset(args, 0, sizeof(*args)); + if (mcs_data.completion_bitmap) { args->out.status = HL_WAIT_CS_STATUS_COMPLETED; args->out.cs_completion_map = mcs_data.completion_bitmap; @@ -2667,8 +2674,6 @@ free_seq_arr: /* update if some CS was gone */ if (mcs_data.timestamp) args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; - } else if (mcs_data.wait_status == -ERESTARTSYS) { - args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; } else { args->out.status = HL_WAIT_CS_STATUS_BUSY; } @@ -2688,16 +2693,17 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, ×tamp); + if (rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for CS handle %llu\n", + seq); + return -EINTR; + } + memset(args, 0, sizeof(*args)); if (rc) { - if (rc == -ERESTARTSYS) { - dev_err_ratelimited(hdev->dev, - "user process got signal while waiting for CS handle %llu\n", - seq); - args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; - rc = -EINTR; - } else if (rc == -ETIMEDOUT) { + if (rc == -ETIMEDOUT) { dev_err_ratelimited(hdev->dev, "CS %llu has timed-out while user process is waiting for it\n", seq); @@ -2823,7 +2829,6 @@ wait_again: dev_err_ratelimited(hdev->dev, "user process got signal while waiting for interrupt ID %d\n", interrupt->interrupt_id); - *status = HL_WAIT_CS_STATUS_INTERRUPTED; rc = -EINTR; } else { *status = CS_WAIT_STATUS_BUSY; @@ -2878,8 +2883,6 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) args->in.interrupt_timeout_us, args->in.addr, args->in.target, interrupt_offset, &status); - memset(args, 0, sizeof(*args)); - if (rc) { if (rc != -EINTR) dev_err_ratelimited(hdev->dev, @@ -2888,6 +2891,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) return rc; } + memset(args, 0, sizeof(*args)); + switch (status) { case CS_WAIT_STATUS_COMPLETED: args->out.status = HL_WAIT_CS_STATUS_COMPLETED; diff --git a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c index 99b5c1ecc444..be41843df75b 100644 --- a/drivers/misc/mei/hbm.c +++ b/drivers/misc/mei/hbm.c @@ -1298,7 +1298,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_STARTING) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: start: on shutdown, ignoring\n"); return 0; } @@ -1381,7 +1382,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != 
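The habanalabs hunks above move the signal check ahead of the memset() of the user-visible arguments and turn -ERESTARTSYS into -EINTR instead of encoding it as a status value, so an interrupted wait no longer clobbers the output structure. A minimal userspace sketch of that ordering; ERESTARTSYS is defined locally because it is a kernel-internal errno, and wait_for_completion_stub()/wait_ioctl() are invented for the example.

/* Sketch of "map -ERESTARTSYS to -EINTR and bail out before clobbering the
 * output arguments", as in the habanalabs wait-ioctl changes above.
 * All names are illustrative, not driver APIs.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define ERESTARTSYS 512  /* kernel-internal errno, never reaches userspace */

struct wait_args {
    unsigned int status;  /* output: only valid when the call succeeds */
};

/* pretend wait primitive: < 0 on error, 0 when the work completed */
static int wait_for_completion_stub(int simulate_signal)
{
    return simulate_signal ? -ERESTARTSYS : 0;
}

static int wait_ioctl(struct wait_args *args, int simulate_signal)
{
    int rc = wait_for_completion_stub(simulate_signal);

    if (rc == -ERESTARTSYS)
        return -EINTR;  /* report the signal, leave args untouched */

    memset(args, 0, sizeof(*args));  /* only now is it safe to overwrite */
    args->status = 1;                /* e.g. "completed" */
    return 0;
}

int main(void)
{
    struct wait_args args = { .status = 0xdead };
    int rc;

    rc = wait_ioctl(&args, 1);  /* interrupted: args stays intact */
    printf("signal case: rc=%d status=%#x\n", rc, args.status);

    rc = wait_ioctl(&args, 0);  /* completed: args rewritten */
    printf("normal case: rc=%d status=%#x\n", rc, args.status);
    return 0;
}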
MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_DR_SETUP) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: dma setup response: on shutdown, ignoring\n"); return 0; } @@ -1448,7 +1450,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_CLIENT_PROPERTIES) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: properties response: on shutdown, ignoring\n"); return 0; } @@ -1490,7 +1493,8 @@ int mei_hbm_dispatch(struct mei_device *dev, struct mei_msg_hdr *hdr) if (dev->dev_state != MEI_DEV_INIT_CLIENTS || dev->hbm_state != MEI_HBM_ENUM_CLIENTS) { - if (dev->dev_state == MEI_DEV_POWER_DOWN) { + if (dev->dev_state == MEI_DEV_POWER_DOWN || + dev->dev_state == MEI_DEV_POWERING_DOWN) { dev_dbg(dev->dev, "hbm: enumeration response: on shutdown, ignoring\n"); return 0; } diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index cb34925e10f1..67bb6a25fd0a 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -92,6 +92,7 @@ #define MEI_DEV_ID_CDF 0x18D3 /* Cedar Fork */ #define MEI_DEV_ID_ICP_LP 0x34E0 /* Ice Lake Point LP */ +#define MEI_DEV_ID_ICP_N 0x38E0 /* Ice Lake Point N */ #define MEI_DEV_ID_JSP_N 0x4DE0 /* Jasper Lake Point N */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index c3393b383e59..3a45aaf002ac 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -96,6 +96,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_CMP_H_3, MEI_ME_PCH8_ITOUCH_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ICP_LP, MEI_ME_PCH12_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_ICP_N, MEI_ME_PCH12_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_TGP_LP, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_TGP_H, MEI_ME_PCH15_SPS_CFG)}, diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 431af5e8be2f..74882fa0f86d 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -258,7 +258,7 @@ static ssize_t power_ro_lock_store(struct device *dev, mq = &md->queue; /* Dispatch locking to the block layer */ - req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, 0); + req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_OUT, 0); if (IS_ERR(req)) { count = PTR_ERR(req); goto out_put; @@ -266,7 +266,7 @@ static ssize_t power_ro_lock_store(struct device *dev, req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP; blk_execute_rq(NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; - blk_put_request(req); + blk_mq_free_request(req); if (!ret) { pr_info("%s: Locking boot partition ro until next power on\n", @@ -646,7 +646,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, * Dispatch the ioctl() into the block request queue. */ mq = &md->queue; - req = blk_get_request(mq->queue, + req = blk_mq_alloc_request(mq->queue, idata->ic.write_flag ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -660,7 +660,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, blk_execute_rq(NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata); - blk_put_request(req); + blk_mq_free_request(req); cmd_done: kfree(idata->buf); @@ -716,7 +716,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, * Dispatch the ioctl()s into the block request queue. */ mq = &md->queue; - req = blk_get_request(mq->queue, + req = blk_mq_alloc_request(mq->queue, idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -733,7 +733,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, for (i = 0; i < num_of_cmds && !err; i++) err = mmc_blk_ioctl_copy_to_user(&cmds[i], idata[i]); - blk_put_request(req); + blk_mq_free_request(req); cmd_err: for (i = 0; i < num_of_cmds; i++) { @@ -2730,7 +2730,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) int ret; /* Ask the block layer about the card status */ - req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0); + req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; @@ -2740,7 +2740,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) *val = ret; ret = 0; } - blk_put_request(req); + blk_mq_free_request(req); return ret; } @@ -2766,7 +2766,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) return -ENOMEM; /* Ask the block layer for the EXT CSD */ - req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0); + req = blk_mq_alloc_request(mq->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_free; @@ -2775,7 +2775,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) req_to_mmc_queue_req(req)->drv_op_data = &ext_csd; blk_execute_rq(NULL, req, 0); err = req_to_mmc_queue_req(req)->drv_op_result; - blk_put_request(req); + blk_mq_free_request(req); if (err) { pr_err("FAILED %d\n", err); goto out_free; diff --git a/drivers/mmc/core/crypto.c b/drivers/mmc/core/crypto.c index 67557808cada..fec4fbf16a5b 100644 --- a/drivers/mmc/core/crypto.c +++ b/drivers/mmc/core/crypto.c @@ -16,13 +16,13 @@ void mmc_crypto_set_initial_state(struct mmc_host *host) { /* Reset might clear all keys, so reprogram all the keys. 
*/ if (host->caps2 & MMC_CAP2_CRYPTO) - blk_ksm_reprogram_all_keys(&host->ksm); + blk_crypto_reprogram_all_keys(&host->crypto_profile); } void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host) { if (host->caps2 & MMC_CAP2_CRYPTO) - blk_ksm_register(&host->ksm, q); + blk_crypto_register(&host->crypto_profile, q); } EXPORT_SYMBOL_GPL(mmc_crypto_setup_queue); @@ -30,12 +30,15 @@ void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq) { struct request *req = mmc_queue_req_to_req(mqrq); struct mmc_request *mrq = &mqrq->brq.mrq; + struct blk_crypto_keyslot *keyslot; if (!req->crypt_ctx) return; mrq->crypto_ctx = req->crypt_ctx; - if (req->crypt_keyslot) - mrq->crypto_key_slot = blk_ksm_get_slot_idx(req->crypt_keyslot); + + keyslot = req->crypt_keyslot; + if (keyslot) + mrq->crypto_key_slot = blk_crypto_keyslot_index(keyslot); } EXPORT_SYMBOL_GPL(mmc_crypto_prepare_req); diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 4646b7a03db6..c9db24e16af1 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/pm_runtime.h> +#include <linux/scatterlist.h> #include <linux/mmc/host.h> #include <linux/mmc/card.h> diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 71313961cc54..ccc148cdb5ee 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -506,7 +506,7 @@ config MMC_OMAP_HS config MMC_WBSD tristate "Winbond W83L51xD SD/MMC Card Interface support" - depends on ISA_DMA_API + depends on ISA_DMA_API && !M68K help This selects the Winbond(R) W83L51xD Secure digital and Multimedia card Interface. @@ -547,7 +547,7 @@ config MMC_SDHCI_MSM depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI - select QCOM_SCM if MMC_CRYPTO && ARCH_QCOM + select QCOM_SCM if MMC_CRYPTO help This selects the Secure Digital Host Controller Interface (SDHCI) support present in Qualcomm SOCs. 
The controller supports diff --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c index 38559a956330..31f841231609 100644 --- a/drivers/mmc/host/cqhci-core.c +++ b/drivers/mmc/host/cqhci-core.c @@ -282,6 +282,9 @@ static void __cqhci_enable(struct cqhci_host *cq_host) cqhci_writel(cq_host, cqcfg, CQHCI_CFG); + if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT) + cqhci_writel(cq_host, 0, CQHCI_CTL); + mmc->cqe_on = true; if (cq_host->ops->enable) diff --git a/drivers/mmc/host/cqhci-crypto.c b/drivers/mmc/host/cqhci-crypto.c index 6419cfbb4ab7..d5f4b6972f63 100644 --- a/drivers/mmc/host/cqhci-crypto.c +++ b/drivers/mmc/host/cqhci-crypto.c @@ -6,7 +6,7 @@ */ #include <linux/blk-crypto.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #include <linux/mmc/host.h> #include "cqhci-crypto.h" @@ -23,9 +23,10 @@ static const struct cqhci_crypto_alg_entry { }; static inline struct cqhci_host * -cqhci_host_from_ksm(struct blk_keyslot_manager *ksm) +cqhci_host_from_crypto_profile(struct blk_crypto_profile *profile) { - struct mmc_host *mmc = container_of(ksm, struct mmc_host, ksm); + struct mmc_host *mmc = + container_of(profile, struct mmc_host, crypto_profile); return mmc->cqe_private; } @@ -57,12 +58,12 @@ static int cqhci_crypto_program_key(struct cqhci_host *cq_host, return 0; } -static int cqhci_crypto_keyslot_program(struct blk_keyslot_manager *ksm, +static int cqhci_crypto_keyslot_program(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm); + struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile); const union cqhci_crypto_cap_entry *ccap_array = cq_host->crypto_cap_array; const struct cqhci_crypto_alg_entry *alg = @@ -115,11 +116,11 @@ static int cqhci_crypto_clear_keyslot(struct cqhci_host *cq_host, int slot) return cqhci_crypto_program_key(cq_host, &cfg, slot); } -static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, +static int cqhci_crypto_keyslot_evict(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct cqhci_host *cq_host = cqhci_host_from_ksm(ksm); + struct cqhci_host *cq_host = cqhci_host_from_crypto_profile(profile); return cqhci_crypto_clear_keyslot(cq_host, slot); } @@ -132,7 +133,7 @@ static int cqhci_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, * "enabled" when these are called, i.e. CQHCI_ENABLE might not be set in the * CQHCI_CFG register. But the hardware allows that. */ -static const struct blk_ksm_ll_ops cqhci_ksm_ops = { +static const struct blk_crypto_ll_ops cqhci_crypto_ops = { .keyslot_program = cqhci_crypto_keyslot_program, .keyslot_evict = cqhci_crypto_keyslot_evict, }; @@ -157,8 +158,8 @@ cqhci_find_blk_crypto_mode(union cqhci_crypto_cap_entry cap) * * If the driver previously set MMC_CAP2_CRYPTO and the CQE declares * CQHCI_CAP_CS, initialize the crypto support. This involves reading the - * crypto capability registers, initializing the keyslot manager, clearing all - * keyslots, and enabling 128-bit task descriptors. + * crypto capability registers, initializing the blk_crypto_profile, clearing + * all keyslots, and enabling 128-bit task descriptors. * * Return: 0 if crypto was initialized or isn't supported; whether * MMC_CAP2_CRYPTO remains set indicates which one of those cases it is. 
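cqhci_host_from_crypto_profile() above recovers the mmc_host that embeds a blk_crypto_profile by using container_of(). The standalone sketch below shows the offsetof() arithmetic behind that pattern with made-up structure names (struct host / struct crypto_profile are not the mmc types).

/* container_of() in isolation, mirroring cqhci_host_from_crypto_profile().
 * The structure and field names are stand-ins, not the mmc/cqhci types.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct crypto_profile {
    int max_dun_bytes;
};

struct host {
    const char *name;
    struct crypto_profile profile;  /* embedded member */
};

static struct host *host_from_profile(struct crypto_profile *profile)
{
    return container_of(profile, struct host, profile);
}

int main(void)
{
    struct host h = { .name = "host0", .profile = { .max_dun_bytes = 4 } };
    struct crypto_profile *p = &h.profile;

    printf("recovered host: %s (dun bytes %d)\n",
           host_from_profile(p)->name, p->max_dun_bytes);
    return 0;
}

The only requirement is that the member really is embedded in the outer structure, which is why the conversion helper sits next to the structure that owns the profile.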
@@ -168,7 +169,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) { struct mmc_host *mmc = cq_host->mmc; struct device *dev = mmc_dev(mmc); - struct blk_keyslot_manager *ksm = &mmc->ksm; + struct blk_crypto_profile *profile = &mmc->crypto_profile; unsigned int num_keyslots; unsigned int cap_idx; enum blk_crypto_mode_num blk_mode_num; @@ -199,15 +200,15 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) */ num_keyslots = cq_host->crypto_capabilities.config_count + 1; - err = devm_blk_ksm_init(dev, ksm, num_keyslots); + err = devm_blk_crypto_profile_init(dev, profile, num_keyslots); if (err) goto out; - ksm->ksm_ll_ops = cqhci_ksm_ops; - ksm->dev = dev; + profile->ll_ops = cqhci_crypto_ops; + profile->dev = dev; /* Unfortunately, CQHCI crypto only supports 32 DUN bits. */ - ksm->max_dun_bytes_supported = 4; + profile->max_dun_bytes_supported = 4; /* * Cache all the crypto capabilities and advertise the supported crypto @@ -223,7 +224,7 @@ int cqhci_crypto_init(struct cqhci_host *cq_host) cq_host->crypto_cap_array[cap_idx]); if (blk_mode_num == BLK_ENCRYPTION_MODE_INVALID) continue; - ksm->crypto_modes_supported[blk_mode_num] |= + profile->modes_supported[blk_mode_num] |= cq_host->crypto_cap_array[cap_idx].sdus_mask * 512; } diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 0c75810812a0..1f8a3c0ddfe1 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -464,6 +464,18 @@ static s8 dw_mci_exynos_get_best_clksmpl(u8 candiates) } } + /* + * If there is no cadiates value, then it needs to return -EIO. + * If there are candiates values and don't find bset clk sample value, + * then use a first candiates clock sample value. + */ + for (i = 0; i < iter; i++) { + __c = ror8(candiates, i); + if ((__c & 0x1) == 0x1) { + loc = i; + goto out; + } + } out: return loc; } @@ -494,6 +506,8 @@ static int dw_mci_exynos_execute_tuning(struct dw_mci_slot *slot, u32 opcode) priv->tuned_sample = found; } else { ret = -EIO; + dev_warn(&mmc->class_dev, + "There is no candiates value about clksmpl!\n"); } return ret; diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 3f28eb4d17fe..8f36536cb1b6 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -746,7 +746,7 @@ static void meson_mmc_desc_chain_transfer(struct mmc_host *mmc, u32 cmd_cfg) writel(start, host->regs + SD_EMMC_START); } -/* local sg copy to buffer version with _to/fromio usage for dram_access_quirk */ +/* local sg copy for dram_access_quirk */ static void meson_mmc_copy_buffer(struct meson_host *host, struct mmc_data *data, size_t buflen, bool to_buffer) { @@ -764,21 +764,27 @@ static void meson_mmc_copy_buffer(struct meson_host *host, struct mmc_data *data sg_miter_start(&miter, sgl, nents, sg_flags); while ((offset < buflen) && sg_miter_next(&miter)) { - unsigned int len; + unsigned int buf_offset = 0; + unsigned int len, left; + u32 *buf = miter.addr; len = min(miter.length, buflen - offset); + left = len; - /* When dram_access_quirk, the bounce buffer is a iomem mapping */ - if (host->dram_access_quirk) { - if (to_buffer) - memcpy_toio(host->bounce_iomem_buf + offset, miter.addr, len); - else - memcpy_fromio(miter.addr, host->bounce_iomem_buf + offset, len); + if (to_buffer) { + do { + writel(*buf++, host->bounce_iomem_buf + offset + buf_offset); + + buf_offset += 4; + left -= 4; + } while (left); } else { - if (to_buffer) - memcpy(host->bounce_buf + offset, miter.addr, len); - else - memcpy(miter.addr, 
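dw_mci_exynos_get_best_clksmpl() above gains a fallback pass that picks the first available candidate bit when no best clock sample is found, using ror8() to rotate the candidate mask. A self-contained C version of that scan; ror8() is reimplemented locally and the candidate mask values are arbitrary.

/* Fallback scan over a rotated candidate bitmask, as added to
 * dw_mci_exynos_get_best_clksmpl() above.  ror8() is reimplemented locally.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t ror8(uint8_t word, unsigned int shift)
{
    shift &= 7;
    return (uint8_t)((word >> shift) | (word << (8 - shift)));
}

static int first_candidate(uint8_t candidates)
{
    unsigned int i;

    for (i = 0; i < 8; i++) {
        uint8_t c = ror8(candidates, i);

        if (c & 0x1)
            return (int)i;  /* first usable clock sample */
    }
    return -1;              /* nothing set: the caller returns -EIO */
}

int main(void)
{
    printf("candidates 0x28 -> sample %d\n", first_candidate(0x28)); /* 3 */
    printf("candidates 0x00 -> sample %d\n", first_candidate(0x00)); /* -1 */
    return 0;
}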
host->bounce_buf + offset, len); + do { + *buf++ = readl(host->bounce_iomem_buf + offset + buf_offset); + + buf_offset += 4; + left -= 4; + } while (left); } offset += len; @@ -830,7 +836,11 @@ static void meson_mmc_start_cmd(struct mmc_host *mmc, struct mmc_command *cmd) if (data->flags & MMC_DATA_WRITE) { cmd_cfg |= CMD_CFG_DATA_WR; WARN_ON(xfer_bytes > host->bounce_buf_size); - meson_mmc_copy_buffer(host, data, xfer_bytes, true); + if (host->dram_access_quirk) + meson_mmc_copy_buffer(host, data, xfer_bytes, true); + else + sg_copy_to_buffer(data->sg, data->sg_len, + host->bounce_buf, xfer_bytes); dma_wmb(); } @@ -849,12 +859,43 @@ static void meson_mmc_start_cmd(struct mmc_host *mmc, struct mmc_command *cmd) writel(cmd->arg, host->regs + SD_EMMC_CMD_ARG); } +static int meson_mmc_validate_dram_access(struct mmc_host *mmc, struct mmc_data *data) +{ + struct scatterlist *sg; + int i; + + /* Reject request if any element offset or size is not 32bit aligned */ + for_each_sg(data->sg, sg, data->sg_len, i) { + if (!IS_ALIGNED(sg->offset, sizeof(u32)) || + !IS_ALIGNED(sg->length, sizeof(u32))) { + dev_err(mmc_dev(mmc), "unaligned sg offset %u len %u\n", + data->sg->offset, data->sg->length); + return -EINVAL; + } + } + + return 0; +} + static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) { struct meson_host *host = mmc_priv(mmc); bool needs_pre_post_req = mrq->data && !(mrq->data->host_cookie & SD_EMMC_PRE_REQ_DONE); + /* + * The memory at the end of the controller used as bounce buffer for + * the dram_access_quirk only accepts 32bit read/write access, + * check the aligment and length of the data before starting the request. + */ + if (host->dram_access_quirk && mrq->data) { + mrq->cmd->error = meson_mmc_validate_dram_access(mmc, mrq->data); + if (mrq->cmd->error) { + mmc_request_done(mmc, mrq); + return; + } + } + if (needs_pre_post_req) { meson_mmc_get_transfer_mode(mmc, mrq); if (!meson_mmc_desc_chain_mode(mrq->data)) @@ -999,7 +1040,11 @@ static irqreturn_t meson_mmc_irq_thread(int irq, void *dev_id) if (meson_mmc_bounce_buf_read(data)) { xfer_bytes = data->blksz * data->blocks; WARN_ON(xfer_bytes > host->bounce_buf_size); - meson_mmc_copy_buffer(host, data, xfer_bytes, false); + if (host->dram_access_quirk) + meson_mmc_copy_buffer(host, data, xfer_bytes, false); + else + sg_copy_from_buffer(data->sg, data->sg_len, + host->bounce_buf, xfer_bytes); } next_cmd = meson_mmc_get_next_command(cmd); diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 4dfc246c5f95..b06b4dcb7c78 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2577,6 +2577,25 @@ static int msdc_drv_probe(struct platform_device *pdev) host->dma_mask = DMA_BIT_MASK(32); mmc_dev(mmc)->dma_mask = &host->dma_mask; + host->timeout_clks = 3 * 1048576; + host->dma.gpd = dma_alloc_coherent(&pdev->dev, + 2 * sizeof(struct mt_gpdma_desc), + &host->dma.gpd_addr, GFP_KERNEL); + host->dma.bd = dma_alloc_coherent(&pdev->dev, + MAX_BD_NUM * sizeof(struct mt_bdma_desc), + &host->dma.bd_addr, GFP_KERNEL); + if (!host->dma.gpd || !host->dma.bd) { + ret = -ENOMEM; + goto release_mem; + } + msdc_init_gpd_bd(host, &host->dma); + INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout); + spin_lock_init(&host->lock); + + platform_set_drvdata(pdev, mmc); + msdc_ungate_clock(host); + msdc_init_hw(host); + if (mmc->caps2 & MMC_CAP2_CQE) { host->cq_host = devm_kzalloc(mmc->parent, sizeof(*host->cq_host), @@ -2597,25 +2616,6 @@ static int msdc_drv_probe(struct platform_device *pdev) 
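The meson-gx changes above restrict the dram_access_quirk bounce buffer to whole 32-bit accesses and reject scatterlist elements whose offset or length is not 4-byte aligned. The sketch below models only the alignment rule: a uint32_t array stands in for the iomem region, plain stores replace writel(), and there is no scatterlist.

/* 32-bit-only copies into a "register-like" buffer, as enforced by the
 * meson-gx dram_access_quirk changes above.  The uint32_t array stands in
 * for the iomem bounce buffer; there is no real readl()/writel() here.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IS_ALIGNED(x, a)  (((x) & ((a) - 1)) == 0)

static uint32_t bounce[16];  /* pretend SRAM that only takes 32-bit accesses */

static int copy_to_bounce(const void *src, size_t len)
{
    const uint8_t *s = src;
    size_t off;

    if (!IS_ALIGNED(len, sizeof(uint32_t)))
        return -1;  /* mirrors rejecting unaligned scatterlist elements */

    for (off = 0; off < len; off += 4) {
        uint32_t word;

        memcpy(&word, s + off, 4);  /* assemble one full 32-bit value */
        bounce[off / 4] = word;     /* writel() in the real driver */
    }
    return 0;
}

int main(void)
{
    const char data[8] = "abcdefg";

    printf("aligned copy:   %d\n", copy_to_bounce(data, 8));
    printf("unaligned copy: %d\n", copy_to_bounce(data, 6));
    printf("bounce[0] = 0x%08x\n", (unsigned int)bounce[0]);
    return 0;
}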
mmc->max_seg_size = 64 * 1024; } - host->timeout_clks = 3 * 1048576; - host->dma.gpd = dma_alloc_coherent(&pdev->dev, - 2 * sizeof(struct mt_gpdma_desc), - &host->dma.gpd_addr, GFP_KERNEL); - host->dma.bd = dma_alloc_coherent(&pdev->dev, - MAX_BD_NUM * sizeof(struct mt_bdma_desc), - &host->dma.bd_addr, GFP_KERNEL); - if (!host->dma.gpd || !host->dma.bd) { - ret = -ENOMEM; - goto release_mem; - } - msdc_init_gpd_bd(host, &host->dma); - INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout); - spin_lock_init(&host->lock); - - platform_set_drvdata(pdev, mmc); - msdc_ungate_clock(host); - msdc_init_hw(host); - ret = devm_request_irq(&pdev->dev, host->irq, msdc_irq, IRQF_TRIGGER_NONE, pdev->name, host); if (ret) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index f18d169bc8ff..e658f0174242 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1187,6 +1187,7 @@ static void esdhc_reset_tuning(struct sdhci_host *host) struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host); u32 ctrl; + int ret; /* Reset the tuning circuit */ if (esdhc_is_usdhc(imx_data)) { @@ -1199,7 +1200,22 @@ static void esdhc_reset_tuning(struct sdhci_host *host) } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) { ctrl = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS); ctrl &= ~ESDHC_MIX_CTRL_SMPCLK_SEL; + ctrl &= ~ESDHC_MIX_CTRL_EXE_TUNE; writel(ctrl, host->ioaddr + SDHCI_AUTO_CMD_STATUS); + /* Make sure ESDHC_MIX_CTRL_EXE_TUNE cleared */ + ret = readl_poll_timeout(host->ioaddr + SDHCI_AUTO_CMD_STATUS, + ctrl, !(ctrl & ESDHC_MIX_CTRL_EXE_TUNE), 1, 50); + if (ret == -ETIMEDOUT) + dev_warn(mmc_dev(host->mmc), + "Warning! clear execute tuning bit failed\n"); + /* + * SDHCI_INT_DATA_AVAIL is W1C bit, set this bit will clear the + * usdhc IP internal logic flag execute_tuning_with_clr_buf, which + * will finally make sure the normal data transfer logic correct. 
+ */ + ctrl = readl(host->ioaddr + SDHCI_INT_STATUS); + ctrl |= SDHCI_INT_DATA_AVAIL; + writel(ctrl, host->ioaddr + SDHCI_INT_STATUS); } } } diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 5564d7b23e7c..d1a1c548c515 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -11,6 +11,7 @@ #include <linux/delay.h> #include <linux/err.h> #include <linux/io.h> +#include <linux/iopoll.h> #include <linux/kernel.h> #include <linux/mmc/host.h> #include <linux/mmc/slot-gpio.h> @@ -61,7 +62,6 @@ static void sdhci_at91_set_force_card_detect(struct sdhci_host *host) static void sdhci_at91_set_clock(struct sdhci_host *host, unsigned int clock) { u16 clk; - unsigned long timeout; host->mmc->actual_clock = 0; @@ -86,16 +86,11 @@ static void sdhci_at91_set_clock(struct sdhci_host *host, unsigned int clock) sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); /* Wait max 20 ms */ - timeout = 20; - while (!((clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL)) - & SDHCI_CLOCK_INT_STABLE)) { - if (timeout == 0) { - pr_err("%s: Internal clock never stabilised.\n", - mmc_hostname(host->mmc)); - return; - } - timeout--; - mdelay(1); + if (read_poll_timeout(sdhci_readw, clk, (clk & SDHCI_CLOCK_INT_STABLE), + 1000, 20000, false, host, SDHCI_CLOCK_CONTROL)) { + pr_err("%s: Internal clock never stabilised.\n", + mmc_hostname(host->mmc)); + return; } clk |= SDHCI_CLOCK_CARD_EN; @@ -114,6 +109,7 @@ static void sdhci_at91_reset(struct sdhci_host *host, u8 mask) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_at91_priv *priv = sdhci_pltfm_priv(pltfm_host); + unsigned int tmp; sdhci_reset(host, mask); @@ -126,6 +122,10 @@ static void sdhci_at91_reset(struct sdhci_host *host, u8 mask) sdhci_writel(host, calcr | SDMMC_CALCR_ALWYSON | SDMMC_CALCR_EN, SDMMC_CALCR); + + if (read_poll_timeout(sdhci_readl, tmp, !(tmp & SDMMC_CALCR_EN), + 10, 20000, false, host, SDMMC_CALCR)) + dev_err(mmc_dev(host->mmc), "Failed to calibrate\n"); } } diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index be19785227fe..d0f2edfe296c 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -616,16 +616,12 @@ static int intel_select_drive_strength(struct mmc_card *card, return intel_host->drv_strength; } -static int bxt_get_cd(struct mmc_host *mmc) +static int sdhci_get_cd_nogpio(struct mmc_host *mmc) { - int gpio_cd = mmc_gpio_get_cd(mmc); struct sdhci_host *host = mmc_priv(mmc); unsigned long flags; int ret = 0; - if (!gpio_cd) - return 0; - spin_lock_irqsave(&host->lock, flags); if (host->flags & SDHCI_DEVICE_DEAD) @@ -638,6 +634,21 @@ out: return ret; } +static int bxt_get_cd(struct mmc_host *mmc) +{ + int gpio_cd = mmc_gpio_get_cd(mmc); + + if (!gpio_cd) + return 0; + + return sdhci_get_cd_nogpio(mmc); +} + +static int mrfld_get_cd(struct mmc_host *mmc) +{ + return sdhci_get_cd_nogpio(mmc); +} + #define SDHCI_INTEL_PWR_TIMEOUT_CNT 20 #define SDHCI_INTEL_PWR_TIMEOUT_UDELAY 100 @@ -1341,6 +1352,14 @@ static int intel_mrfld_mmc_probe_slot(struct sdhci_pci_slot *slot) MMC_CAP_1_8V_DDR; break; case INTEL_MRFLD_SD: + slot->cd_idx = 0; + slot->cd_override_level = true; + /* + * There are two PCB designs of SD card slot with the opposite + * card detection sense. Quirk this out by ignoring GPIO state + * completely in the custom ->get_cd() callback. 
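Both the esdhc-imx and sdhci-of-at91 hunks above replace hand-rolled mdelay()/udelay() loops with readl_poll_timeout()/read_poll_timeout(). The snippet below is a simplified userspace equivalent built on clock_gettime() and usleep(); poll_reg_timeout() and the fake register are inventions for the example, and the kernel macros take different arguments.

/* Poll-with-timeout in userspace, approximating what read_poll_timeout()
 * does in the sdhci hunks above.  The "register" is just a volatile int.
 */
#include <errno.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile int fake_reg;

static long long now_us(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;
}

/* returns 0 once (reg & mask) is non-zero, -ETIMEDOUT otherwise */
static int poll_reg_timeout(volatile int *reg, int mask,
                            long sleep_us, long timeout_us)
{
    long long deadline = now_us() + timeout_us;

    for (;;) {
        if (*reg & mask)
            return 0;
        if (now_us() > deadline)           /* final re-check avoids a race */
            return (*reg & mask) ? 0 : -ETIMEDOUT;
        usleep(sleep_us);
    }
}

int main(void)
{
    fake_reg = 0;
    printf("bit never set: %d\n", poll_reg_timeout(&fake_reg, 0x2, 1000, 20000));

    fake_reg = 0x2;
    printf("bit present:   %d\n", poll_reg_timeout(&fake_reg, 0x2, 1000, 20000));
    return 0;
}

As in the kernel helpers, the condition is checked one last time after the deadline so a success that races with the timeout is not reported as -ETIMEDOUT.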
+ */ + slot->host->mmc_host_ops.get_cd = mrfld_get_cd; slot->host->quirks2 |= SDHCI_QUIRK2_NO_1_8_V; break; case INTEL_MRFLD_SDIO: diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 8eefa7d5fe85..2d80a04e11d8 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2042,6 +2042,12 @@ void sdhci_set_power_noreg(struct sdhci_host *host, unsigned char mode, break; case MMC_VDD_32_33: case MMC_VDD_33_34: + /* + * 3.4 ~ 3.6V are valid only for those platforms where it's + * known that the voltage range is supported by hardware. + */ + case MMC_VDD_34_35: + case MMC_VDD_35_36: pwr = SDHCI_POWER_330; break; default: diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 7dfc26f48c18..e2affa52ef46 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -195,6 +195,10 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host) sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, host->sdcard_irq_mask_all); host->sdcard_irq_mask = host->sdcard_irq_mask_all; + if (host->native_hotplug) + tmio_mmc_enable_mmc_irqs(host, + TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT); + tmio_mmc_set_bus_width(host, host->mmc->ios.bus_width); if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { @@ -956,8 +960,15 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) case MMC_POWER_OFF: tmio_mmc_power_off(host); /* For R-Car Gen2+, we need to reset SDHI specific SCC */ - if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) + if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) { host->reset(host); + + if (host->native_hotplug) + tmio_mmc_enable_mmc_irqs(host, + TMIO_STAT_CARD_REMOVE | + TMIO_STAT_CARD_INSERT); + } + host->set_clock(host, 0); break; case MMC_POWER_UP: @@ -1185,10 +1196,6 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) _host->set_clock(_host, 0); tmio_mmc_reset(_host); - if (_host->native_hotplug) - tmio_mmc_enable_mmc_irqs(_host, - TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT); - spin_lock_init(&_host->lock); mutex_init(&_host->ios_lock); diff --git a/drivers/mmc/host/vub300.c b/drivers/mmc/host/vub300.c index 4950d10d3a19..97beece62fec 100644 --- a/drivers/mmc/host/vub300.c +++ b/drivers/mmc/host/vub300.c @@ -576,7 +576,7 @@ static void check_vub300_port_status(struct vub300_mmc_host *vub300) GET_SYSTEM_PORT_STATUS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, &vub300->system_port_status, - sizeof(vub300->system_port_status), HZ); + sizeof(vub300->system_port_status), 1000); if (sizeof(vub300->system_port_status) == retval) new_system_port_status(vub300); } @@ -1241,7 +1241,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, SET_INTERRUPT_PSEUDOCODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, - xfer_buffer, xfer_length, HZ); + xfer_buffer, xfer_length, 1000); kfree(xfer_buffer); if (retval < 0) goto copy_error_message; @@ -1284,7 +1284,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, SET_TRANSFER_PSEUDOCODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, - xfer_buffer, xfer_length, HZ); + xfer_buffer, xfer_length, 1000); kfree(xfer_buffer); if (retval < 0) goto copy_error_message; @@ -1991,7 +1991,7 @@ static void __set_clock_speed(struct vub300_mmc_host *vub300, u8 buf[8], usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_CLOCK_SPEED, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0x00, 0x00, buf, buf_array_size, HZ); + 0x00, 0x00, buf, buf_array_size, 1000); if 
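The vub300 hunks above change every usb_control_msg() timeout from HZ to 1000 because that parameter is in milliseconds, not jiffies; passing HZ is only correct by accident on CONFIG_HZ=1000 kernels. A small numeric illustration, where to_jiffies() is a simplified stand-in for msecs_to_jiffies():

/* Why "HZ" is wrong as a millisecond timeout, as fixed in vub300.c above.
 * to_jiffies() is a simplified stand-in for msecs_to_jiffies().
 */
#include <stdio.h>

static unsigned long to_jiffies(unsigned int msecs, unsigned int hz)
{
    return (unsigned long)msecs * hz / 1000;  /* rounding ignored for brevity */
}

int main(void)
{
    const unsigned int hz_values[] = { 100, 250, 1000 };
    unsigned int i;

    for (i = 0; i < 3; i++) {
        unsigned int hz = hz_values[i];

        /* passing "HZ" where milliseconds are expected yields a timeout of
         * HZ milliseconds, i.e. anywhere from 100 ms to 1000 ms */
        printf("CONFIG_HZ=%-4u  HZ-as-ms = %4u ms   1000 ms = %lu jiffies\n",
               hz, hz, to_jiffies(1000, hz));
    }
    return 0;
}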
(retval != 8) { dev_err(&vub300->udev->dev, "SET_CLOCK_SPEED" " %dkHz failed with retval=%d\n", kHzClock, retval); @@ -2013,14 +2013,14 @@ static void vub300_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_SD_POWER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0x0000, 0x0000, NULL, 0, HZ); + 0x0000, 0x0000, NULL, 0, 1000); /* must wait for the VUB300 u-proc to boot up */ msleep(600); } else if ((ios->power_mode == MMC_POWER_UP) && !vub300->card_powered) { usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_SD_POWER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0x0001, 0x0000, NULL, 0, HZ); + 0x0001, 0x0000, NULL, 0, 1000); msleep(600); vub300->card_powered = 1; } else if (ios->power_mode == MMC_POWER_ON) { @@ -2275,14 +2275,14 @@ static int vub300_probe(struct usb_interface *interface, GET_HC_INF0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, &vub300->hc_info, - sizeof(vub300->hc_info), HZ); + sizeof(vub300->hc_info), 1000); if (retval < 0) goto error5; retval = usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_ROM_WAIT_STATES, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - firmware_rom_wait_states, 0x0000, NULL, 0, HZ); + firmware_rom_wait_states, 0x0000, NULL, 0, 1000); if (retval < 0) goto error5; dev_info(&vub300->udev->dev, @@ -2297,7 +2297,7 @@ static int vub300_probe(struct usb_interface *interface, GET_SYSTEM_PORT_STATUS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, &vub300->system_port_status, - sizeof(vub300->system_port_status), HZ); + sizeof(vub300->system_port_status), 1000); if (retval < 0) { goto error4; } else if (sizeof(vub300->system_port_status) == retval) { diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index b8ae1ec14e17..4eaba6f4ec68 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -384,7 +384,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) if (new->readonly) set_disk_ro(gd, 1); - device_add_disk(&new->mtd->dev, gd, NULL); + ret = device_add_disk(&new->mtd->dev, gd, NULL); + if (ret) + goto out_cleanup_disk; if (new->disk_attributes) { ret = sysfs_create_group(&disk_to_dev(gd)->kobj, @@ -393,6 +395,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) } return 0; +out_cleanup_disk: + blk_cleanup_disk(new->disk); out_free_tag_set: blk_mq_free_tag_set(new->tag_set); out_kfree_tag_set: diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index 38b6aa849c63..5ff001140ef4 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/major.h> #include <linux/backing-dev.h> +#include <linux/blkdev.h> #include <linux/fs_context.h> #include "mtdcore.h" diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index ef0badea4f41..04e6f7b26706 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -1676,13 +1676,17 @@ qcom_nandc_read_cw_raw(struct mtd_info *mtd, struct nand_chip *chip, struct nand_ecc_ctrl *ecc = &chip->ecc; int data_size1, data_size2, oob_size1, oob_size2; int ret, reg_off = FLASH_BUF_ACC, read_loc = 0; + int raw_cw = cw; nand_read_page_op(chip, page, 0, NULL, 0); host->use_ecc = false; + if (nandc->props->qpic_v2) + raw_cw = ecc->steps - 1; + clear_bam_transaction(nandc); set_address(host, host->cw_size * cw, page); - update_rw_regs(host, 1, true, cw); + update_rw_regs(host, 1, true, raw_cw); 
config_nand_page_read(chip); data_size1 = mtd->writesize - host->cw_size * (ecc->steps - 1); @@ -1711,7 +1715,7 @@ qcom_nandc_read_cw_raw(struct mtd_info *mtd, struct nand_chip *chip, nandc_set_read_loc(chip, cw, 3, read_loc, oob_size2, 1); } - config_nand_cw_read(chip, false, cw); + config_nand_cw_read(chip, false, raw_cw); read_data_dma(nandc, reg_off, data_buf, data_size1, 0); reg_off += data_size1; diff --git a/drivers/net/can/m_can/m_can_platform.c b/drivers/net/can/m_can/m_can_platform.c index 308d4f2fff00..eee47bad0592 100644 --- a/drivers/net/can/m_can/m_can_platform.c +++ b/drivers/net/can/m_can/m_can_platform.c @@ -32,8 +32,13 @@ static u32 iomap_read_reg(struct m_can_classdev *cdev, int reg) static int iomap_read_fifo(struct m_can_classdev *cdev, int offset, void *val, size_t val_count) { struct m_can_plat_priv *priv = cdev_to_priv(cdev); + void __iomem *src = priv->mram_base + offset; - ioread32_rep(priv->mram_base + offset, val, val_count); + while (val_count--) { + *(unsigned int *)val = ioread32(src); + val += 4; + src += 4; + } return 0; } @@ -51,8 +56,13 @@ static int iomap_write_fifo(struct m_can_classdev *cdev, int offset, const void *val, size_t val_count) { struct m_can_plat_priv *priv = cdev_to_priv(cdev); + void __iomem *dst = priv->mram_base + offset; - iowrite32_rep(priv->base + offset, val, val_count); + while (val_count--) { + iowrite32(*(unsigned int *)val, dst); + val += 4; + dst += 4; + } return 0; } diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 00e4533c8bdd..8999ec9455ec 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -846,10 +846,12 @@ static int __maybe_unused rcar_can_suspend(struct device *dev) struct rcar_can_priv *priv = netdev_priv(ndev); u16 ctlr; - if (netif_running(ndev)) { - netif_stop_queue(ndev); - netif_device_detach(ndev); - } + if (!netif_running(ndev)) + return 0; + + netif_stop_queue(ndev); + netif_device_detach(ndev); + ctlr = readw(&priv->regs->ctlr); ctlr |= RCAR_CAN_CTLR_CANM_HALT; writew(ctlr, &priv->regs->ctlr); @@ -868,6 +870,9 @@ static int __maybe_unused rcar_can_resume(struct device *dev) u16 ctlr; int err; + if (!netif_running(ndev)) + return 0; + err = clk_enable(priv->clk); if (err) { netdev_err(ndev, "clk_enable() failed, error %d\n", err); @@ -881,10 +886,9 @@ static int __maybe_unused rcar_can_resume(struct device *dev) writew(ctlr, &priv->regs->ctlr); priv->can.state = CAN_STATE_ERROR_ACTIVE; - if (netif_running(ndev)) { - netif_device_attach(ndev); - netif_start_queue(ndev); - } + netif_device_attach(ndev); + netif_start_queue(ndev); + return 0; } diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c index 6db90dc4bc9d..84f34020aafb 100644 --- a/drivers/net/can/sja1000/peak_pci.c +++ b/drivers/net/can/sja1000/peak_pci.c @@ -752,16 +752,15 @@ static void peak_pci_remove(struct pci_dev *pdev) struct net_device *prev_dev = chan->prev_dev; dev_info(&pdev->dev, "removing device %s\n", dev->name); + /* do that only for first channel */ + if (!prev_dev && chan->pciec_card) + peak_pciec_remove(chan->pciec_card); unregister_sja1000dev(dev); free_sja1000dev(dev); dev = prev_dev; - if (!dev) { - /* do that only for first channel */ - if (chan->pciec_card) - peak_pciec_remove(chan->pciec_card); + if (!dev) break; - } priv = netdev_priv(dev); chan = priv->priv; } diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index b11eabad575b..09029a3bad1a 100644 --- 
a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -551,11 +551,10 @@ static int pcan_usb_fd_decode_status(struct pcan_usb_fd_if *usb_if, } else if (sm->channel_p_w_b & PUCAN_BUS_WARNING) { new_state = CAN_STATE_ERROR_WARNING; } else { - /* no error bit (so, no error skb, back to active state) */ - dev->can.state = CAN_STATE_ERROR_ACTIVE; + /* back to (or still in) ERROR_ACTIVE state */ + new_state = CAN_STATE_ERROR_ACTIVE; pdev->bec.txerr = 0; pdev->bec.rxerr = 0; - return 0; } /* state hasn't changed */ @@ -568,8 +567,7 @@ static int pcan_usb_fd_decode_status(struct pcan_usb_fd_if *usb_if, /* allocate an skb to store the error frame */ skb = alloc_can_err_skb(netdev, &cf); - if (skb) - can_change_state(netdev, cf, tx_state, rx_state); + can_change_state(netdev, cf, tx_state, rx_state); /* things must be done even in case of OOM */ if (new_state == CAN_STATE_BUS_OFF) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 3ff4b7e177f3..dbd4486a173f 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -230,7 +230,7 @@ #define GSWIP_SDMA_PCTRLp(p) (0xBC0 + ((p) * 0x6)) #define GSWIP_SDMA_PCTRL_EN BIT(0) /* SDMA Port Enable */ #define GSWIP_SDMA_PCTRL_FCEN BIT(1) /* Flow Control Enable */ -#define GSWIP_SDMA_PCTRL_PAUFWD BIT(1) /* Pause Frame Forwarding */ +#define GSWIP_SDMA_PCTRL_PAUFWD BIT(3) /* Pause Frame Forwarding */ #define GSWIP_TABLE_ACTIVE_VLAN 0x01 #define GSWIP_TABLE_VLAN_MAPPING 0x02 diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 1542bfb8b5e5..7c2968a639eb 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -449,8 +449,10 @@ EXPORT_SYMBOL(ksz_switch_register); void ksz_switch_remove(struct ksz_device *dev) { /* timer started */ - if (dev->mib_read_interval) + if (dev->mib_read_interval) { + dev->mib_read_interval = 0; cancel_delayed_work_sync(&dev->mib_read); + } dev->dev_ops->exit(dev); dsa_unregister_switch(dev->ds); diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 094737e5084a..9890672a206d 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1035,9 +1035,6 @@ mt7530_port_enable(struct dsa_switch *ds, int port, { struct mt7530_priv *priv = ds->priv; - if (!dsa_is_user_port(ds, port)) - return 0; - mutex_lock(&priv->reg_mutex); /* Allow the user port gets connected to the cpu port and also @@ -1060,9 +1057,6 @@ mt7530_port_disable(struct dsa_switch *ds, int port) { struct mt7530_priv *priv = ds->priv; - if (!dsa_is_user_port(ds, port)) - return; - mutex_lock(&priv->reg_mutex); /* Clear up all port matrix which could be restored in the next @@ -3211,7 +3205,7 @@ mt7530_probe(struct mdio_device *mdiodev) return -ENOMEM; priv->ds->dev = &mdiodev->dev; - priv->ds->num_ports = DSA_MAX_PORTS; + priv->ds->num_ports = MT7530_NUM_PORTS; /* Use medatek,mcm property to distinguish hardware type that would * casues a little bit differences on power-on sequence. 
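The lantiq_gswip change above is a one-bit fix: GSWIP_SDMA_PCTRL_PAUFWD had been defined as BIT(1), aliasing the flow-control enable bit, and becomes BIT(3). The sketch below shows how such an alias silently sets the wrong flag and how a compile-time check can catch the overlap; only the two macro names echo the driver, the rest is invented.

/* Aliased register bit definitions, as fixed in lantiq_gswip.c above.
 * Only the two macro names echo the driver; the rest is illustrative.
 */
#include <stdio.h>

#define BIT(n)                  (1U << (n))

#define SDMA_PCTRL_FCEN         BIT(1)  /* Flow Control Enable */
#define SDMA_PCTRL_PAUFWD_BAD   BIT(1)  /* buggy: same bit as FCEN */
#define SDMA_PCTRL_PAUFWD       BIT(3)  /* fixed: Pause Frame Forwarding */

/* a guard like this passes for the fixed macro; applied to the buggy
 * definition it would have failed at compile time */
_Static_assert((SDMA_PCTRL_FCEN & SDMA_PCTRL_PAUFWD) == 0,
               "SDMA_PCTRL bits must not overlap");

int main(void)
{
    unsigned int reg = 0;

    reg |= SDMA_PCTRL_PAUFWD_BAD;  /* meant to forward pause frames only */
    printf("buggy write also enabled FCEN: %s\n",
           (reg & SDMA_PCTRL_FCEN) ? "yes" : "no");

    reg = SDMA_PCTRL_PAUFWD;
    printf("fixed write touches FCEN:      %s\n",
           (reg & SDMA_PCTRL_FCEN) ? "yes" : "no");
    return 0;
}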
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 03744d1c43fc..8dadcae93c9b 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -12,6 +12,7 @@ #include <linux/bitfield.h> #include <linux/delay.h> +#include <linux/dsa/mv88e6xxx.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/if_bridge.h> @@ -749,7 +750,11 @@ static void mv88e6xxx_mac_link_down(struct dsa_switch *ds, int port, ops = chip->info->ops; mv88e6xxx_reg_lock(chip); - if ((!mv88e6xxx_port_ppu_updates(chip, port) || + /* Internal PHYs propagate their configuration directly to the MAC. + * External PHYs depend on whether the PPU is enabled for this port. + */ + if (((!mv88e6xxx_phy_is_internal(ds, port) && + !mv88e6xxx_port_ppu_updates(chip, port)) || mode == MLO_AN_FIXED) && ops->port_sync_link) err = ops->port_sync_link(chip, port, mode, false); mv88e6xxx_reg_unlock(chip); @@ -772,7 +777,12 @@ static void mv88e6xxx_mac_link_up(struct dsa_switch *ds, int port, ops = chip->info->ops; mv88e6xxx_reg_lock(chip); - if (!mv88e6xxx_port_ppu_updates(chip, port) || mode == MLO_AN_FIXED) { + /* Internal PHYs propagate their configuration directly to the MAC. + * External PHYs depend on whether the PPU is enabled for this port. + */ + if ((!mv88e6xxx_phy_is_internal(ds, port) && + !mv88e6xxx_port_ppu_updates(chip, port)) || + mode == MLO_AN_FIXED) { /* FIXME: for an automedia port, should we force the link * down here - what if the link comes up due to "other" media * while we're bringing the port up, how is the exclusivity @@ -1677,6 +1687,30 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, return 0; } +static int mv88e6xxx_port_commit_pvid(struct mv88e6xxx_chip *chip, int port) +{ + struct dsa_port *dp = dsa_to_port(chip->ds, port); + struct mv88e6xxx_port *p = &chip->ports[port]; + u16 pvid = MV88E6XXX_VID_STANDALONE; + bool drop_untagged = false; + int err; + + if (dp->bridge_dev) { + if (br_vlan_enabled(dp->bridge_dev)) { + pvid = p->bridge_pvid.vid; + drop_untagged = !p->bridge_pvid.valid; + } else { + pvid = MV88E6XXX_VID_BRIDGED; + } + } + + err = mv88e6xxx_port_set_pvid(chip, port, pvid); + if (err) + return err; + + return mv88e6xxx_port_drop_untagged(chip, port, drop_untagged); +} + static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack) @@ -1690,7 +1724,16 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, return -EOPNOTSUPP; mv88e6xxx_reg_lock(chip); + err = mv88e6xxx_port_set_8021q_mode(chip, port, mode); + if (err) + goto unlock; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + goto unlock; + +unlock: mv88e6xxx_reg_unlock(chip); return err; @@ -1725,11 +1768,15 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, u16 fid; int err; - /* Null VLAN ID corresponds to the port private database */ + /* Ports have two private address databases: one for when the port is + * standalone and one for when the port is under a bridge and the + * 802.1Q mode is disabled. When the port is standalone, DSA wants its + * address database to remain 100% empty, so we never load an ATU entry + * into a standalone port's database. Therefore, translate the null + * VLAN ID into the port's database used for VLAN-unaware bridging. 
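mv88e6xxx_port_commit_pvid() above centralises the choice of port VLAN ID: standalone ports get MV88E6XXX_VID_STANDALONE, ports under a VLAN-unaware bridge get MV88E6XXX_VID_BRIDGED, and ports under a VLAN-aware bridge use the bridge pvid or drop untagged traffic when no pvid is valid. The sketch below reduces just that decision to a pure function; the VID constants and struct fields are simplified placeholders, not the driver's definitions.

/* The pvid selection of mv88e6xxx_port_commit_pvid(), reduced to a pure
 * function.  The VID constants and the port structure are simplified
 * placeholders, not the driver's definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define VID_STANDALONE  0u
#define VID_BRIDGED     4095u

struct port_state {
    bool in_bridge;
    bool vlan_filtering;     /* br_vlan_enabled() in the real code */
    bool bridge_pvid_valid;
    unsigned int bridge_pvid;
};

static void commit_pvid(const struct port_state *p,
                        unsigned int *pvid, bool *drop_untagged)
{
    *pvid = VID_STANDALONE;
    *drop_untagged = false;

    if (p->in_bridge) {
        if (p->vlan_filtering) {
            *pvid = p->bridge_pvid;
            *drop_untagged = !p->bridge_pvid_valid;
        } else {
            *pvid = VID_BRIDGED;
        }
    }
}

int main(void)
{
    const struct port_state cases[] = {
        { false, false, false, 0 },    /* standalone port           */
        { true,  false, false, 0 },    /* VLAN-unaware bridge       */
        { true,  true,  true,  100 },  /* VLAN-aware, pvid set      */
        { true,  true,  false, 0 },    /* VLAN-aware, no valid pvid */
    };
    unsigned int i;

    for (i = 0; i < 4; i++) {
        unsigned int pvid;
        bool drop;

        commit_pvid(&cases[i], &pvid, &drop);
        printf("case %u: pvid=%u drop_untagged=%d\n", i, pvid, (int)drop);
    }
    return 0;
}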
+ */ if (vid == 0) { - err = mv88e6xxx_port_get_fid(chip, port, &fid); - if (err) - return err; + fid = MV88E6XXX_FID_BRIDGED; } else { err = mv88e6xxx_vtu_get(chip, vid, &vlan); if (err) @@ -2123,6 +2170,7 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, struct mv88e6xxx_chip *chip = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct mv88e6xxx_port *p = &chip->ports[port]; bool warn; u8 member; int err; @@ -2156,13 +2204,21 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, } if (pvid) { - err = mv88e6xxx_port_set_pvid(chip, port, vlan->vid); - if (err) { - dev_err(ds->dev, "p%d: failed to set PVID %d\n", - port, vlan->vid); + p->bridge_pvid.vid = vlan->vid; + p->bridge_pvid.valid = true; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + goto out; + } else if (vlan->vid && p->bridge_pvid.vid == vlan->vid) { + /* The old pvid was reinstalled as a non-pvid VLAN */ + p->bridge_pvid.valid = false; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) goto out; - } } + out: mv88e6xxx_reg_unlock(chip); @@ -2212,6 +2268,7 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct mv88e6xxx_chip *chip = ds->priv; + struct mv88e6xxx_port *p = &chip->ports[port]; int err = 0; u16 pvid; @@ -2229,7 +2286,9 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, goto unlock; if (vlan->vid == pvid) { - err = mv88e6xxx_port_set_pvid(chip, port, 0); + p->bridge_pvid.valid = false; + + err = mv88e6xxx_port_commit_pvid(chip, port); if (err) goto unlock; } @@ -2393,7 +2452,16 @@ static int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, int err; mv88e6xxx_reg_lock(chip); + err = mv88e6xxx_bridge_map(chip, br); + if (err) + goto unlock; + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + goto unlock; + +unlock: mv88e6xxx_reg_unlock(chip); return err; @@ -2403,11 +2471,20 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, struct net_device *br) { struct mv88e6xxx_chip *chip = ds->priv; + int err; mv88e6xxx_reg_lock(chip); + if (mv88e6xxx_bridge_map(chip, br) || mv88e6xxx_port_vlan_map(chip, port)) dev_err(ds->dev, "failed to remap in-chip Port VLAN\n"); + + err = mv88e6xxx_port_commit_pvid(chip, port); + if (err) + dev_err(ds->dev, + "port %d failed to restore standalone pvid: %pe\n", + port, ERR_PTR(err)); + mv88e6xxx_reg_unlock(chip); } @@ -2853,6 +2930,20 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) if (err) return err; + /* Associate MV88E6XXX_VID_BRIDGED with MV88E6XXX_FID_BRIDGED in the + * ATU by virtue of the fact that mv88e6xxx_atu_new() will pick it as + * the first free FID after MV88E6XXX_FID_STANDALONE. This will be used + * as the private PVID on ports under a VLAN-unaware bridge. + * Shared (DSA and CPU) ports must also be members of it, to translate + * the VID from the DSA tag into MV88E6XXX_FID_BRIDGED, instead of + * relying on their port default FID. 
+ */ + err = mv88e6xxx_port_vlan_join(chip, port, MV88E6XXX_VID_BRIDGED, + MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNTAGGED, + false); + if (err) + return err; + if (chip->info->ops->port_set_jumbo_size) { err = chip->info->ops->port_set_jumbo_size(chip, port, 10218); if (err) @@ -2925,7 +3016,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) * database, and allow bidirectional communication between the * CPU and DSA port(s), and the other ports. */ - err = mv88e6xxx_port_set_fid(chip, port, 0); + err = mv88e6xxx_port_set_fid(chip, port, MV88E6XXX_FID_STANDALONE); if (err) return err; @@ -3115,6 +3206,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) } } + err = mv88e6xxx_vtu_setup(chip); + if (err) + goto unlock; + /* Setup Switch Port Registers */ for (i = 0; i < mv88e6xxx_num_ports(chip); i++) { if (dsa_is_unused_port(ds, i)) @@ -3144,10 +3239,6 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) if (err) goto unlock; - err = mv88e6xxx_vtu_setup(chip); - if (err) - goto unlock; - err = mv88e6xxx_pvt_setup(chip); if (err) goto unlock; diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 59f316cc8583..8271b8aa7b71 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -21,6 +21,9 @@ #define EDSA_HLEN 8 #define MV88E6XXX_N_FID 4096 +#define MV88E6XXX_FID_STANDALONE 0 +#define MV88E6XXX_FID_BRIDGED 1 + /* PVT limits for 4-bit port and 5-bit switch */ #define MV88E6XXX_MAX_PVT_SWITCHES 32 #define MV88E6XXX_MAX_PVT_PORTS 16 @@ -246,9 +249,15 @@ struct mv88e6xxx_policy { u16 vid; }; +struct mv88e6xxx_vlan { + u16 vid; + bool valid; +}; + struct mv88e6xxx_port { struct mv88e6xxx_chip *chip; int port; + struct mv88e6xxx_vlan bridge_pvid; u64 serdes_stats[2]; u64 atu_member_violation; u64 atu_miss_violation; diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 451028c57af8..d9817b20ea64 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -1257,6 +1257,27 @@ int mv88e6xxx_port_set_8021q_mode(struct mv88e6xxx_chip *chip, int port, return 0; } +int mv88e6xxx_port_drop_untagged(struct mv88e6xxx_chip *chip, int port, + bool drop_untagged) +{ + u16 old, new; + int err; + + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL2, &old); + if (err) + return err; + + if (drop_untagged) + new = old | MV88E6XXX_PORT_CTL2_DISCARD_UNTAGGED; + else + new = old & ~MV88E6XXX_PORT_CTL2_DISCARD_UNTAGGED; + + if (new == old) + return 0; + + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, new); +} + int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port) { u16 reg; diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h index b10e5aebacf6..03382b66f800 100644 --- a/drivers/net/dsa/mv88e6xxx/port.h +++ b/drivers/net/dsa/mv88e6xxx/port.h @@ -423,6 +423,8 @@ int mv88e6393x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, phy_interface_t mode); int mv88e6185_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode); int mv88e6352_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode); +int mv88e6xxx_port_drop_untagged(struct mv88e6xxx_chip *chip, int port, + bool drop_untagged); int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port); int mv88e6095_port_set_upstream_port(struct mv88e6xxx_chip *chip, int port, int upstream_port); diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index a3a9636430d6..341236dcbdb4 100644 --- 
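mv88e6xxx_port_drop_untagged() above follows a read-modify-write pattern that skips the register write entirely when the value would not change, avoiding a redundant (and slow) bus access. A generic userspace rendition, with an in-memory array standing in for the switch registers and invented reg_read()/reg_write() helpers:

/* Read-modify-write that skips redundant writes, mirroring
 * mv88e6xxx_port_drop_untagged() above.  regs[] replaces the switch
 * registers; reg_read()/reg_write() are invented helpers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CTL2_DISCARD_UNTAGGED  0x0100

static uint16_t regs[4];
static unsigned int write_count;

static uint16_t reg_read(int port)
{
    return regs[port];
}

static void reg_write(int port, uint16_t val)
{
    regs[port] = val;
    write_count++;  /* each call models a slow MDIO transaction */
}

static void set_drop_untagged(int port, bool drop)
{
    uint16_t old = reg_read(port);
    uint16_t new = drop ? (old | CTL2_DISCARD_UNTAGGED)
                        : (old & ~CTL2_DISCARD_UNTAGGED);

    if (new == old)
        return;  /* nothing to do, skip the bus access */

    reg_write(port, new);
}

int main(void)
{
    set_drop_untagged(0, true);   /* writes */
    set_drop_untagged(0, true);   /* no-op  */
    set_drop_untagged(0, false);  /* writes */

    printf("register writes issued: %u (expected 2)\n", write_count);
    return 0;
}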
a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -266,12 +266,12 @@ static void felix_8021q_cpu_port_deinit(struct ocelot *ocelot, int port) */ static int felix_setup_mmio_filtering(struct felix *felix) { - unsigned long user_ports = 0, cpu_ports = 0; + unsigned long user_ports = dsa_user_ports(felix->ds); struct ocelot_vcap_filter *redirect_rule; struct ocelot_vcap_filter *tagging_rule; struct ocelot *ocelot = &felix->ocelot; struct dsa_switch *ds = felix->ds; - int port, ret; + int cpu = -1, port, ret; tagging_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); if (!tagging_rule) @@ -284,12 +284,15 @@ static int felix_setup_mmio_filtering(struct felix *felix) } for (port = 0; port < ocelot->num_phys_ports; port++) { - if (dsa_is_user_port(ds, port)) - user_ports |= BIT(port); - if (dsa_is_cpu_port(ds, port)) - cpu_ports |= BIT(port); + if (dsa_is_cpu_port(ds, port)) { + cpu = port; + break; + } } + if (cpu < 0) + return -EINVAL; + tagging_rule->key_type = OCELOT_VCAP_KEY_ETYPE; *(__be16 *)tagging_rule->key.etype.etype.value = htons(ETH_P_1588); *(__be16 *)tagging_rule->key.etype.etype.mask = htons(0xffff); @@ -325,7 +328,7 @@ static int felix_setup_mmio_filtering(struct felix *felix) * the CPU port module */ redirect_rule->action.mask_mode = OCELOT_MASK_MODE_REDIRECT; - redirect_rule->action.port_mask = cpu_ports; + redirect_rule->action.port_mask = BIT(cpu); } else { /* Trap PTP packets only to the CPU port module (which is * redirected to the NPI port) @@ -1074,6 +1077,101 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) return 0; } +static void ocelot_port_purge_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *skb) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; + struct sk_buff *skb_match = NULL, *skb_tmp; + unsigned long flags; + + if (!clone) + return; + + spin_lock_irqsave(&ocelot_port->tx_skbs.lock, flags); + + skb_queue_walk_safe(&ocelot_port->tx_skbs, skb, skb_tmp) { + if (skb != clone) + continue; + __skb_unlink(skb, &ocelot_port->tx_skbs); + skb_match = skb; + break; + } + + spin_unlock_irqrestore(&ocelot_port->tx_skbs.lock, flags); + + WARN_ONCE(!skb_match, + "Could not find skb clone in TX timestamping list\n"); +} + +#define work_to_xmit_work(w) \ + container_of((w), struct felix_deferred_xmit_work, work) + +static void felix_port_deferred_xmit(struct kthread_work *work) +{ + struct felix_deferred_xmit_work *xmit_work = work_to_xmit_work(work); + struct dsa_switch *ds = xmit_work->dp->ds; + struct sk_buff *skb = xmit_work->skb; + u32 rew_op = ocelot_ptp_rew_op(skb); + struct ocelot *ocelot = ds->priv; + int port = xmit_work->dp->index; + int retries = 10; + + do { + if (ocelot_can_inject(ocelot, 0)) + break; + + cpu_relax(); + } while (--retries); + + if (!retries) { + dev_err(ocelot->dev, "port %d failed to inject skb\n", + port); + ocelot_port_purge_txtstamp_skb(ocelot, port, skb); + kfree_skb(skb); + return; + } + + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + + consume_skb(skb); + kfree(xmit_work); +} + +static int felix_port_setup_tagger_data(struct dsa_switch *ds, int port) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + struct felix_port *felix_port; + + if (!dsa_port_is_user(dp)) + return 0; + + felix_port = kzalloc(sizeof(*felix_port), GFP_KERNEL); + if (!felix_port) + return -ENOMEM; + + felix_port->xmit_worker = 
felix->xmit_worker; + felix_port->xmit_work_fn = felix_port_deferred_xmit; + + dp->priv = felix_port; + + return 0; +} + +static void felix_port_teardown_tagger_data(struct dsa_switch *ds, int port) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct felix_port *felix_port = dp->priv; + + if (!felix_port) + return; + + dp->priv = NULL; + kfree(felix_port); +} + /* Hardware initialization done here so that we can allocate structures with * devm without fear of dsa_register_switch returning -EPROBE_DEFER and causing * us to allocate structures twice (leak memory) and map PCI memory twice @@ -1102,6 +1200,12 @@ static int felix_setup(struct dsa_switch *ds) } } + felix->xmit_worker = kthread_create_worker(0, "felix_xmit"); + if (IS_ERR(felix->xmit_worker)) { + err = PTR_ERR(felix->xmit_worker); + goto out_deinit_timestamp; + } + for (port = 0; port < ds->num_ports; port++) { if (dsa_is_unused_port(ds, port)) continue; @@ -1112,6 +1216,14 @@ static int felix_setup(struct dsa_switch *ds) * bits of vlan tag. */ felix_port_qos_map_init(ocelot, port); + + err = felix_port_setup_tagger_data(ds, port); + if (err) { + dev_err(ds->dev, + "port %d failed to set up tagger data: %pe\n", + port, ERR_PTR(err)); + goto out_deinit_ports; + } } err = ocelot_devlink_sb_register(ocelot); @@ -1126,6 +1238,7 @@ static int felix_setup(struct dsa_switch *ds) * there's no real point in checking for errors. */ felix_set_tag_protocol(ds, port, felix->tag_proto); + break; } ds->mtu_enforcement_ingress = true; @@ -1138,9 +1251,13 @@ out_deinit_ports: if (dsa_is_unused_port(ds, port)) continue; + felix_port_teardown_tagger_data(ds, port); ocelot_deinit_port(ocelot, port); } + kthread_destroy_worker(felix->xmit_worker); + +out_deinit_timestamp: ocelot_deinit_timestamp(ocelot); ocelot_deinit(ocelot); @@ -1162,19 +1279,23 @@ static void felix_teardown(struct dsa_switch *ds) continue; felix_del_tag_protocol(ds, port, felix->tag_proto); + break; } - ocelot_devlink_sb_unregister(ocelot); - ocelot_deinit_timestamp(ocelot); - ocelot_deinit(ocelot); - for (port = 0; port < ocelot->num_phys_ports; port++) { if (dsa_is_unused_port(ds, port)) continue; + felix_port_teardown_tagger_data(ds, port); ocelot_deinit_port(ocelot, port); } + kthread_destroy_worker(felix->xmit_worker); + + ocelot_devlink_sb_unregister(ocelot); + ocelot_deinit_timestamp(ocelot); + ocelot_deinit(ocelot); + if (felix->info->mdio_bus_free) felix->info->mdio_bus_free(ocelot); } @@ -1291,8 +1412,12 @@ static void felix_txtstamp(struct dsa_switch *ds, int port, if (!ocelot->ptp) return; - if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) + if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) { + dev_err_ratelimited(ds->dev, + "port %d delivering skb without TX timestamp\n", + port); return; + } if (clone) OCELOT_SKB_CB(skb)->clone = clone; diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 54024b6f9498..be3e42e135c0 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -62,6 +62,7 @@ struct felix { resource_size_t switch_base; resource_size_t imdio_base; enum dsa_tag_protocol tag_proto; + struct kthread_worker *xmit_worker; }; struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port); diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 7c0db80eff00..924c3f129992 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3117,7 +3117,7 @@ static void sja1105_teardown(struct 
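/*
 * Sketch of the bounded busy-poll used by the deferred-xmit worker
 * above: retry a readiness check a fixed number of times, yield in
 * between, and take the drop/cleanup path if the window never opens.
 * demo_can_inject() and the retry budget are placeholders, not the
 * driver's actual readiness check.
 */
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static int demo_ready_after = 3;	/* pretend HW frees up on poll #3 */

static bool demo_can_inject(void)
{
	return --demo_ready_after <= 0;
}

static int demo_deferred_xmit(void)
{
	int retries = 10;

	do {
		if (demo_can_inject())
			break;
		sched_yield();		/* userspace analogue of cpu_relax() */
	} while (--retries);

	if (!retries) {
		fprintf(stderr, "injection window never opened, dropping\n");
		return -1;		/* caller frees the frame */
	}

	printf("frame injected after %d attempt(s)\n", 11 - retries);
	return 0;
}

int main(void)
{
	return demo_deferred_xmit() ? 1 : 0;
}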
dsa_switch *ds) sja1105_static_config_free(&priv->static_config); } -const struct dsa_switch_ops sja1105_switch_ops = { +static const struct dsa_switch_ops sja1105_switch_ops = { .get_tag_protocol = sja1105_get_tag_protocol, .setup = sja1105_setup, .teardown = sja1105_teardown, @@ -3166,7 +3166,6 @@ const struct dsa_switch_ops sja1105_switch_ops = { .port_bridge_tx_fwd_offload = dsa_tag_8021q_bridge_tx_fwd_offload, .port_bridge_tx_fwd_unoffload = dsa_tag_8021q_bridge_tx_fwd_unoffload, }; -EXPORT_SYMBOL_GPL(sja1105_switch_ops); static const struct of_device_id sja1105_dt_ids[]; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 691f6dd7e669..54396992a919 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -64,6 +64,7 @@ enum sja1105_ptp_clk_mode { static int sja1105_change_rxtstamping(struct sja1105_private *priv, bool on) { + struct sja1105_tagger_data *tagger_data = &priv->tagger_data; struct sja1105_ptp_data *ptp_data = &priv->ptp_data; struct sja1105_general_params_entry *general_params; struct sja1105_table *table; @@ -79,7 +80,7 @@ static int sja1105_change_rxtstamping(struct sja1105_private *priv, priv->tagger_data.stampable_skb = NULL; } ptp_cancel_worker_sync(ptp_data->clock); - skb_queue_purge(&ptp_data->skb_txtstamp_queue); + skb_queue_purge(&tagger_data->skb_txtstamp_queue); skb_queue_purge(&ptp_data->skb_rxtstamp_queue); return sja1105_static_config_reload(priv, SJA1105_RX_HWTSTAMPING); @@ -452,40 +453,6 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, return priv->info->rxtstamp(ds, port, skb); } -void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, - enum sja1110_meta_tstamp dir, u64 tstamp) -{ - struct sja1105_private *priv = ds->priv; - struct sja1105_ptp_data *ptp_data = &priv->ptp_data; - struct sk_buff *skb, *skb_tmp, *skb_match = NULL; - struct skb_shared_hwtstamps shwt = {0}; - - /* We don't care about RX timestamps on the CPU port */ - if (dir == SJA1110_META_TSTAMP_RX) - return; - - spin_lock(&ptp_data->skb_txtstamp_queue.lock); - - skb_queue_walk_safe(&ptp_data->skb_txtstamp_queue, skb, skb_tmp) { - if (SJA1105_SKB_CB(skb)->ts_id != ts_id) - continue; - - __skb_unlink(skb, &ptp_data->skb_txtstamp_queue); - skb_match = skb; - - break; - } - - spin_unlock(&ptp_data->skb_txtstamp_queue.lock); - - if (WARN_ON(!skb_match)) - return; - - shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); - skb_complete_tx_timestamp(skb_match, &shwt); -} -EXPORT_SYMBOL_GPL(sja1110_process_meta_tstamp); - /* In addition to cloning the skb which is done by the common * sja1105_port_txtstamp, we need to generate a timestamp ID and save the * packet to the TX timestamping queue. @@ -494,7 +461,6 @@ void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; struct sja1105_private *priv = ds->priv; - struct sja1105_ptp_data *ptp_data = &priv->ptp_data; struct sja1105_port *sp = &priv->ports[port]; u8 ts_id; @@ -510,7 +476,7 @@ void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) spin_unlock(&sp->data->meta_lock); - skb_queue_tail(&ptp_data->skb_txtstamp_queue, clone); + skb_queue_tail(&sp->data->skb_txtstamp_queue, clone); } /* Called from dsa_skb_tx_timestamp. 
This callback is just to clone @@ -953,7 +919,7 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) /* Only used on SJA1105 */ skb_queue_head_init(&ptp_data->skb_rxtstamp_queue); /* Only used on SJA1110 */ - skb_queue_head_init(&ptp_data->skb_txtstamp_queue); + skb_queue_head_init(&tagger_data->skb_txtstamp_queue); spin_lock_init(&tagger_data->meta_lock); ptp_data->clock = ptp_clock_register(&ptp_data->caps, ds->dev); @@ -971,6 +937,7 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) void sja1105_ptp_clock_unregister(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + struct sja1105_tagger_data *tagger_data = &priv->tagger_data; struct sja1105_ptp_data *ptp_data = &priv->ptp_data; if (IS_ERR_OR_NULL(ptp_data->clock)) @@ -978,7 +945,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds) del_timer_sync(&ptp_data->extts_timer); ptp_cancel_worker_sync(ptp_data->clock); - skb_queue_purge(&ptp_data->skb_txtstamp_queue); + skb_queue_purge(&tagger_data->skb_txtstamp_queue); skb_queue_purge(&ptp_data->skb_rxtstamp_queue); ptp_clock_unregister(ptp_data->clock); ptp_data->clock = NULL; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h index 3c874bb4c17b..3ae6b9fdd492 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.h +++ b/drivers/net/dsa/sja1105/sja1105_ptp.h @@ -8,21 +8,6 @@ #if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) -/* Timestamps are in units of 8 ns clock ticks (equivalent to - * a fixed 125 MHz clock). - */ -#define SJA1105_TICK_NS 8 - -static inline s64 ns_to_sja1105_ticks(s64 ns) -{ - return ns / SJA1105_TICK_NS; -} - -static inline s64 sja1105_ticks_to_ns(s64 ticks) -{ - return ticks * SJA1105_TICK_NS; -} - /* Calculate the first base_time in the future that satisfies this * relationship: * @@ -77,10 +62,6 @@ struct sja1105_ptp_data { struct timer_list extts_timer; /* Used only on SJA1105 to reconstruct partial timestamps */ struct sk_buff_head skb_rxtstamp_queue; - /* Used on SJA1110 where meta frames are generated only for - * 2-step TX timestamps - */ - struct sk_buff_head skb_txtstamp_queue; struct ptp_clock_info caps; struct ptp_clock *clock; struct sja1105_ptp_cmd cmd; diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index d796684ec9ca..412ae3e43ffb 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -100,6 +100,7 @@ config JME config KORINA tristate "Korina (IDT RC32434) Ethernet support" depends on MIKROTIK_RB532 || COMPILE_TEST + select CRC32 select MII help If you have a Mikrotik RouterBoard 500 or IDT RC32434 diff --git a/drivers/net/ethernet/arc/Kconfig b/drivers/net/ethernet/arc/Kconfig index 37a41773dd43..92a79c4ffa2c 100644 --- a/drivers/net/ethernet/arc/Kconfig +++ b/drivers/net/ethernet/arc/Kconfig @@ -21,6 +21,7 @@ config ARC_EMAC_CORE depends on ARC || ARCH_ROCKCHIP || COMPILE_TEST select MII select PHYLIB + select CRC32 config ARC_EMAC tristate "ARC EMAC support" diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 691e1475d55e..0fbecd093fa1 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -1193,7 +1193,7 @@ static int nic_register_interrupts(struct nicpf *nic) dev_err(&nic->pdev->dev, "Request for #%d msix vectors failed, returned %d\n", nic->num_vec, ret); - return 1; + return ret; } /* Register mailbox interrupt handler */ diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
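/*
 * Worked example of the fixed-tick timestamp conversion the PTP hunks
 * above rely on: an 8 ns tick (125 MHz free-running counter), so
 * ns = ticks * 8. Sketch only; DEMO_TICK_NS mirrors the SJA1105_TICK_NS
 * constant visible in the hunk, the rest is illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_TICK_NS 8	/* 125 MHz -> one tick every 8 ns */

static int64_t demo_ticks_to_ns(int64_t ticks) { return ticks * DEMO_TICK_NS; }
static int64_t demo_ns_to_ticks(int64_t ns)    { return ns / DEMO_TICK_NS; }

int main(void)
{
	int64_t ticks = 1250000000;	/* 10 s worth of 8 ns ticks */

	printf("%lld ticks = %lld ns\n",
	       (long long)ticks, (long long)demo_ticks_to_ns(ticks));
	printf("1 ms = %lld ticks\n", (long long)demo_ns_to_ticks(1000000));
	return 0;
}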
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index d1667b759522..a27227aeae88 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1224,7 +1224,7 @@ static int nicvf_register_misc_interrupt(struct nicvf *nic) if (ret < 0) { netdev_err(nic->netdev, "Req for #%d msix vectors failed\n", nic->num_vec); - return 1; + return ret; } sprintf(nic->irq_name[irq], "%s Mbox", "NICVF"); @@ -1243,7 +1243,7 @@ static int nicvf_register_misc_interrupt(struct nicvf *nic) if (!nicvf_check_pf_ready(nic)) { nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0); nicvf_unregister_interrupts(nic); - return 1; + return -EIO; } return 0; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 9690e36e9e85..910b9f722504 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -157,7 +157,7 @@ static const struct { { ENETC_PM0_TFRM, "MAC tx frames" }, { ENETC_PM0_TFCS, "MAC tx fcs errors" }, { ENETC_PM0_TVLAN, "MAC tx VLAN frames" }, - { ENETC_PM0_TERR, "MAC tx frames" }, + { ENETC_PM0_TERR, "MAC tx frame errors" }, { ENETC_PM0_TUCA, "MAC tx unicast frames" }, { ENETC_PM0_TMCA, "MAC tx multicast frames" }, { ENETC_PM0_TBCA, "MAC tx broadcast frames" }, diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 4c977dfc44f0..d522bd5c90b4 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -517,10 +517,13 @@ static void enetc_port_si_configure(struct enetc_si *si) static void enetc_configure_port_mac(struct enetc_hw *hw) { + int tc; + enetc_port_wr(hw, ENETC_PM0_MAXFRM, ENETC_SET_MAXFRM(ENETC_RX_MAXFRM_SIZE)); - enetc_port_wr(hw, ENETC_PTCMSDUR(0), ENETC_MAC_MAXFRM_SIZE); + for (tc = 0; tc < 8; tc++) + enetc_port_wr(hw, ENETC_PTCMSDUR(tc), ENETC_MAC_MAXFRM_SIZE); enetc_port_wr(hw, ENETC_PM0_CMD_CFG, ENETC_PM0_CMD_PHY_TX_EN | ENETC_PM0_CMD_TXP | ENETC_PM0_PROMISC); diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 1d3188e8e3b3..92dc18a4bcc4 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -780,7 +780,7 @@ struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv) gve_num_tx_qpls(priv)); /* we are out of rx qpls */ - if (id == priv->qpl_cfg.qpl_map_size) + if (id == gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv)) return NULL; set_bit(id, priv->qpl_cfg.qpl_id_map); diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 099a2bc5ae67..bf8a4a7c43f7 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -41,6 +41,7 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) { struct gve_priv *priv = netdev_priv(dev); unsigned int start; + u64 packets, bytes; int ring; if (priv->rx) { @@ -48,10 +49,12 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) do { start = u64_stats_fetch_begin(&priv->rx[ring].statss); - s->rx_packets += priv->rx[ring].rpackets; - s->rx_bytes += priv->rx[ring].rbytes; + packets = priv->rx[ring].rpackets; + bytes = priv->rx[ring].rbytes; } while (u64_stats_fetch_retry(&priv->rx[ring].statss, start)); + s->rx_packets += packets; + s->rx_bytes += bytes; } } if (priv->tx) { @@ -59,10 +62,12 @@ static void 
gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) do { start = u64_stats_fetch_begin(&priv->tx[ring].statss); - s->tx_packets += priv->tx[ring].pkt_done; - s->tx_bytes += priv->tx[ring].bytes_done; + packets = priv->tx[ring].pkt_done; + bytes = priv->tx[ring].bytes_done; } while (u64_stats_fetch_retry(&priv->tx[ring].statss, start)); + s->tx_packets += packets; + s->tx_bytes += bytes; } } } @@ -82,6 +87,9 @@ static int gve_alloc_counter_array(struct gve_priv *priv) static void gve_free_counter_array(struct gve_priv *priv) { + if (!priv->counter_array) + return; + dma_free_coherent(&priv->pdev->dev, priv->num_event_counters * sizeof(*priv->counter_array), @@ -142,6 +150,9 @@ static int gve_alloc_stats_report(struct gve_priv *priv) static void gve_free_stats_report(struct gve_priv *priv) { + if (!priv->stats_report) + return; + del_timer_sync(&priv->stats_report_timer); dma_free_coherent(&priv->pdev->dev, priv->stats_report_len, priv->stats_report, priv->stats_report_bus); @@ -370,18 +381,19 @@ static void gve_free_notify_blocks(struct gve_priv *priv) { int i; - if (priv->msix_vectors) { - /* Free the irqs */ - for (i = 0; i < priv->num_ntfy_blks; i++) { - struct gve_notify_block *block = &priv->ntfy_blocks[i]; - int msix_idx = i; + if (!priv->msix_vectors) + return; - irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, - NULL); - free_irq(priv->msix_vectors[msix_idx].vector, block); - } - free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); + /* Free the irqs */ + for (i = 0; i < priv->num_ntfy_blks; i++) { + struct gve_notify_block *block = &priv->ntfy_blocks[i]; + int msix_idx = i; + + irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, + NULL); + free_irq(priv->msix_vectors[msix_idx].vector, block); } + free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks), priv->ntfy_blocks, priv->ntfy_block_bus); @@ -1185,9 +1197,10 @@ static void gve_handle_reset(struct gve_priv *priv) void gve_handle_report_stats(struct gve_priv *priv) { - int idx, stats_idx = 0, tx_bytes; - unsigned int start = 0; struct stats *stats = priv->stats_report->stats; + int idx, stats_idx = 0; + unsigned int start = 0; + u64 tx_bytes; if (!gve_get_report_stats(priv)) return; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index bb8261368250..94941d4e4744 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -104,8 +104,14 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx) if (!rx->data.page_info) return -ENOMEM; - if (!rx->data.raw_addressing) + if (!rx->data.raw_addressing) { rx->data.qpl = gve_assign_rx_qpl(priv); + if (!rx->data.qpl) { + kvfree(rx->data.page_info); + rx->data.page_info = NULL; + return -ENOMEM; + } + } for (i = 0; i < slots; i++) { if (!rx->data.raw_addressing) { struct page *page = rx->data.qpl->pages[i]; diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index eef1b2764d34..67b0bf310daa 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -10,6 +10,27 @@ static LIST_HEAD(hnae3_ae_algo_list); static LIST_HEAD(hnae3_client_list); static LIST_HEAD(hnae3_ae_dev_list); +void hnae3_unregister_ae_algo_prepare(struct hnae3_ae_algo *ae_algo) +{ + const struct pci_device_id *pci_id; + struct hnae3_ae_dev *ae_dev; + + if (!ae_algo) + return; + + 
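/*
 * Sketch of the sequence-counter retry pattern behind the gve stats
 * fix above: copy the counters into locals inside the retry loop and
 * accumulate them only once the read is known to be consistent, so a
 * retry cannot double-count. The seqcount here is a simplified
 * stand-in for u64_stats_fetch_begin()/u64_stats_fetch_retry().
 */
#include <stdint.h>
#include <stdio.h>

struct demo_ring {
	unsigned int seq;	/* even = stable, odd = writer active */
	uint64_t packets;
	uint64_t bytes;
};

static unsigned int demo_fetch_begin(const struct demo_ring *r)
{
	return r->seq;
}

static int demo_fetch_retry(const struct demo_ring *r, unsigned int start)
{
	return (start & 1) || r->seq != start;	/* retry if read was torn */
}

static void demo_get_stats(const struct demo_ring *rings, int n,
			   uint64_t *tot_pkts, uint64_t *tot_bytes)
{
	for (int i = 0; i < n; i++) {
		uint64_t packets, bytes;
		unsigned int start;

		do {
			start = demo_fetch_begin(&rings[i]);
			packets = rings[i].packets;	/* locals only ... */
			bytes = rings[i].bytes;
		} while (demo_fetch_retry(&rings[i], start));

		*tot_pkts += packets;	/* ... accumulate after the loop */
		*tot_bytes += bytes;
	}
}

int main(void)
{
	struct demo_ring rings[2] = { { 0, 10, 1500 }, { 0, 4, 256 } };
	uint64_t pkts = 0, bytes = 0;

	demo_get_stats(rings, 2, &pkts, &bytes);
	printf("packets=%llu bytes=%llu\n",
	       (unsigned long long)pkts, (unsigned long long)bytes);
	return 0;
}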
list_for_each_entry(ae_dev, &hnae3_ae_dev_list, node) { + if (!hnae3_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B)) + continue; + + pci_id = pci_match_id(ae_algo->pdev_id_table, ae_dev->pdev); + if (!pci_id) + continue; + if (IS_ENABLED(CONFIG_PCI_IOV)) + pci_disable_sriov(ae_dev->pdev); + } +} +EXPORT_SYMBOL(hnae3_unregister_ae_algo_prepare); + /* we are keeping things simple and using single lock for all the * list. This is a non-critical code so other updations, if happen * in parallel, can wait. diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 8ba21d6dc220..d701451596c8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -853,6 +853,7 @@ struct hnae3_handle { int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev); void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev); +void hnae3_unregister_ae_algo_prepare(struct hnae3_ae_algo *ae_algo); void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo); void hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 2b66c59f5eaf..e54f96251fea 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -137,7 +137,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = { .name = "uc", .cmd = HNAE3_DBG_CMD_MAC_UC, .dentry = HNS3_DBG_DENTRY_MAC, - .buf_len = HNS3_DBG_READ_LEN, + .buf_len = HNS3_DBG_READ_LEN_128KB, .init = hns3_dbg_common_file_init, }, { @@ -256,7 +256,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = { .name = "tqp", .cmd = HNAE3_DBG_CMD_REG_TQP, .dentry = HNS3_DBG_DENTRY_REG, - .buf_len = HNS3_DBG_READ_LEN, + .buf_len = HNS3_DBG_READ_LEN_128KB, .init = hns3_dbg_common_file_init, }, { @@ -298,7 +298,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = { .name = "fd_tcam", .cmd = HNAE3_DBG_CMD_FD_TCAM, .dentry = HNS3_DBG_DENTRY_FD, - .buf_len = HNS3_DBG_READ_LEN, + .buf_len = HNS3_DBG_READ_LEN_1MB, .init = hns3_dbg_common_file_init, }, { @@ -462,7 +462,7 @@ static const struct hns3_dbg_item rx_queue_info_items[] = { { "TAIL", 2 }, { "HEAD", 2 }, { "FBDNUM", 2 }, - { "PKTNUM", 2 }, + { "PKTNUM", 5 }, { "COPYBREAK", 2 }, { "RING_EN", 2 }, { "RX_RING_EN", 2 }, @@ -565,7 +565,7 @@ static const struct hns3_dbg_item tx_queue_info_items[] = { { "HEAD", 2 }, { "FBDNUM", 2 }, { "OFFSET", 2 }, - { "PKTNUM", 2 }, + { "PKTNUM", 5 }, { "RING_EN", 2 }, { "TX_RING_EN", 2 }, { "BASE_ADDR", 10 }, @@ -790,13 +790,13 @@ static int hns3_dbg_rx_bd_info(struct hns3_dbg_data *d, char *buf, int len) } static const struct hns3_dbg_item tx_bd_info_items[] = { - { "BD_IDX", 5 }, - { "ADDRESS", 2 }, + { "BD_IDX", 2 }, + { "ADDRESS", 13 }, { "VLAN_TAG", 2 }, { "SIZE", 2 }, { "T_CS_VLAN_TSO", 2 }, { "OT_VLAN_TAG", 3 }, - { "TV", 2 }, + { "TV", 5 }, { "OLT_VLAN_LEN", 2 }, { "PAYLEN_OL4CS", 2 }, { "BD_FE_SC_VLD", 2 }, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 468b8f07bf47..4b886a13e079 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1847,7 +1847,6 @@ void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size) static int hns3_skb_linearize(struct hns3_enet_ring *ring, struct sk_buff *skb, - u8 max_non_tso_bd_num, unsigned int bd_num) { /* 'bd_num == UINT_MAX' means the skb' fraglist has a @@ -1864,8 +1863,7 @@ 
static int hns3_skb_linearize(struct hns3_enet_ring *ring, * will not help. */ if (skb->len > HNS3_MAX_TSO_SIZE || - (!skb_is_gso(skb) && skb->len > - HNS3_MAX_NON_TSO_SIZE(max_non_tso_bd_num))) { + (!skb_is_gso(skb) && skb->len > HNS3_MAX_NON_TSO_SIZE)) { u64_stats_update_begin(&ring->syncp); ring->stats.hw_limitation++; u64_stats_update_end(&ring->syncp); @@ -1900,8 +1898,7 @@ static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, goto out; } - if (hns3_skb_linearize(ring, skb, max_non_tso_bd_num, - bd_num)) + if (hns3_skb_linearize(ring, skb, bd_num)) return -ENOMEM; bd_num = hns3_tx_bd_count(skb->len); @@ -3258,6 +3255,7 @@ static void hns3_buffer_detach(struct hns3_enet_ring *ring, int i) { hns3_unmap_buffer(ring, &ring->desc_cb[i]); ring->desc[i].addr = 0; + ring->desc_cb[i].refill = 0; } static void hns3_free_buffer_detach(struct hns3_enet_ring *ring, int i, @@ -3336,6 +3334,7 @@ static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ring->desc_cb[i].page_offset); + ring->desc_cb[i].refill = 1; return 0; } @@ -3365,6 +3364,7 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, { hns3_unmap_buffer(ring, &ring->desc_cb[i]); ring->desc_cb[i] = *res_cb; + ring->desc_cb[i].refill = 1; ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ring->desc_cb[i].page_offset); ring->desc[i].rx.bd_base_info = 0; @@ -3373,6 +3373,7 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i) { ring->desc_cb[i].reuse_flag = 0; + ring->desc_cb[i].refill = 1; ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ring->desc_cb[i].page_offset); ring->desc[i].rx.bd_base_info = 0; @@ -3479,10 +3480,14 @@ static int hns3_desc_unused(struct hns3_enet_ring *ring) int ntc = ring->next_to_clean; int ntu = ring->next_to_use; + if (unlikely(ntc == ntu && !ring->desc_cb[ntc].refill)) + return ring->desc_num; + return ((ntc >= ntu) ? 
0 : ring->desc_num) + ntc - ntu; } -static void hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, +/* Return true if there is any allocation failure */ +static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, int cleand_count) { struct hns3_desc_cb *desc_cb; @@ -3507,7 +3512,10 @@ static void hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, hns3_rl_err(ring_to_netdev(ring), "alloc rx buffer failed: %d\n", ret); - break; + + writel(i, ring->tqp->io_base + + HNS3_RING_RX_RING_HEAD_REG); + return true; } hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); @@ -3520,6 +3528,7 @@ static void hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, } writel(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG); + return false; } static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) @@ -3824,6 +3833,7 @@ static void hns3_rx_ring_move_fw(struct hns3_enet_ring *ring) { ring->desc[ring->next_to_clean].rx.bd_base_info &= cpu_to_le32(~BIT(HNS3_RXD_VLD_B)); + ring->desc_cb[ring->next_to_clean].refill = 0; ring->next_to_clean += 1; if (unlikely(ring->next_to_clean == ring->desc_num)) @@ -4170,6 +4180,7 @@ int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, { #define RCB_NOF_ALLOC_RX_BUFF_ONCE 16 int unused_count = hns3_desc_unused(ring); + bool failure = false; int recv_pkts = 0; int err; @@ -4178,9 +4189,9 @@ int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, while (recv_pkts < budget) { /* Reuse or realloc buffers */ if (unused_count >= RCB_NOF_ALLOC_RX_BUFF_ONCE) { - hns3_nic_alloc_rx_buffers(ring, unused_count); - unused_count = hns3_desc_unused(ring) - - ring->pending_buf; + failure = failure || + hns3_nic_alloc_rx_buffers(ring, unused_count); + unused_count = 0; } /* Poll one pkt */ @@ -4199,11 +4210,7 @@ int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, } out: - /* Make all data has been write before submit */ - if (unused_count > 0) - hns3_nic_alloc_rx_buffers(ring, unused_count); - - return recv_pkts; + return failure ? budget : recv_pkts; } static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 6162d9f88e37..f09a61d9c626 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -186,11 +186,9 @@ enum hns3_nic_state { #define HNS3_MAX_BD_SIZE 65535 #define HNS3_MAX_TSO_BD_NUM 63U -#define HNS3_MAX_TSO_SIZE \ - (HNS3_MAX_BD_SIZE * HNS3_MAX_TSO_BD_NUM) +#define HNS3_MAX_TSO_SIZE 1048576U +#define HNS3_MAX_NON_TSO_SIZE 9728U -#define HNS3_MAX_NON_TSO_SIZE(max_non_tso_bd_num) \ - (HNS3_MAX_BD_SIZE * (max_non_tso_bd_num)) #define HNS3_VECTOR_GL0_OFFSET 0x100 #define HNS3_VECTOR_GL1_OFFSET 0x200 @@ -332,6 +330,7 @@ struct hns3_desc_cb { u32 length; /* length of the buffer */ u16 reuse_flag; + u16 refill; /* desc type, used by the ring user to mark the type of the priv data */ u16 type; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 307c9e830510..91cb578f56b8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -137,6 +137,15 @@ static int hclge_ets_sch_mode_validate(struct hclge_dev *hdev, *changed = true; break; case IEEE_8021QAZ_TSA_ETS: + /* The hardware will switch to sp mode if bandwidth is + * 0, so limit ets bandwidth must be greater than 0. 
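/*
 * Worked example of the descriptor-ring accounting touched above: with
 * head/tail indices alone, next_to_clean == next_to_use is ambiguous
 * (everything refilled vs nothing refilled), which is why the hunk adds
 * a per-descriptor refill marker. The ring size and names below are
 * illustrative, only the arithmetic mirrors hns3_desc_unused().
 */
#include <stdio.h>

#define DEMO_DESC_NUM 8

static int demo_unused(int ntc, int ntu, int refill_at_ntc)
{
	if (ntc == ntu && !refill_at_ntc)
		return DEMO_DESC_NUM;	/* nothing was ever refilled */

	/* (ntc - ntu) modulo the ring size: slots awaiting refill */
	return ((ntc >= ntu) ? 0 : DEMO_DESC_NUM) + ntc - ntu;
}

int main(void)
{
	printf("unused = %d\n", demo_unused(5, 2, 1));	/* (5-2) mod 8 = 3 */
	printf("unused = %d\n", demo_unused(4, 4, 1));	/* refilled ring: 0 */
	printf("unused = %d\n", demo_unused(4, 4, 0));	/* empty ring: all 8 */
	return 0;
}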
+ */ + if (!ets->tc_tx_bw[i]) { + dev_err(&hdev->pdev->dev, + "tc%u ets bw cannot be 0\n", i); + return -EINVAL; + } + if (hdev->tm_info.tc_info[i].tc_sch_mode != HCLGE_SCH_MODE_DWRR) *changed = true; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 32f62cd2dd99..9cda8b3562b8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -391,7 +391,7 @@ static int hclge_dbg_dump_mac(struct hclge_dev *hdev, char *buf, int len) static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len, int *pos) { - struct hclge_dbg_bitmap_cmd *bitmap; + struct hclge_dbg_bitmap_cmd req; struct hclge_desc desc; u16 qset_id, qset_num; int ret; @@ -408,12 +408,12 @@ static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len, if (ret) return ret; - bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1]; + req.bitmap = (u8)le32_to_cpu(desc.data[1]); *pos += scnprintf(buf + *pos, len - *pos, "%04u %#x %#x %#x %#x\n", - qset_id, bitmap->bit0, bitmap->bit1, - bitmap->bit2, bitmap->bit3); + qset_id, req.bit0, req.bit1, req.bit2, + req.bit3); } return 0; @@ -422,7 +422,7 @@ static int hclge_dbg_dump_dcb_qset(struct hclge_dev *hdev, char *buf, int len, static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len, int *pos) { - struct hclge_dbg_bitmap_cmd *bitmap; + struct hclge_dbg_bitmap_cmd req; struct hclge_desc desc; u8 pri_id, pri_num; int ret; @@ -439,12 +439,11 @@ static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len, if (ret) return ret; - bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1]; + req.bitmap = (u8)le32_to_cpu(desc.data[1]); *pos += scnprintf(buf + *pos, len - *pos, "%03u %#x %#x %#x\n", - pri_id, bitmap->bit0, bitmap->bit1, - bitmap->bit2); + pri_id, req.bit0, req.bit1, req.bit2); } return 0; @@ -453,7 +452,7 @@ static int hclge_dbg_dump_dcb_pri(struct hclge_dev *hdev, char *buf, int len, static int hclge_dbg_dump_dcb_pg(struct hclge_dev *hdev, char *buf, int len, int *pos) { - struct hclge_dbg_bitmap_cmd *bitmap; + struct hclge_dbg_bitmap_cmd req; struct hclge_desc desc; u8 pg_id; int ret; @@ -466,12 +465,11 @@ static int hclge_dbg_dump_dcb_pg(struct hclge_dev *hdev, char *buf, int len, if (ret) return ret; - bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1]; + req.bitmap = (u8)le32_to_cpu(desc.data[1]); *pos += scnprintf(buf + *pos, len - *pos, "%03u %#x %#x %#x\n", - pg_id, bitmap->bit0, bitmap->bit1, - bitmap->bit2); + pg_id, req.bit0, req.bit1, req.bit2); } return 0; @@ -511,7 +509,7 @@ static int hclge_dbg_dump_dcb_queue(struct hclge_dev *hdev, char *buf, int len, static int hclge_dbg_dump_dcb_port(struct hclge_dev *hdev, char *buf, int len, int *pos) { - struct hclge_dbg_bitmap_cmd *bitmap; + struct hclge_dbg_bitmap_cmd req; struct hclge_desc desc; u8 port_id = 0; int ret; @@ -521,12 +519,12 @@ static int hclge_dbg_dump_dcb_port(struct hclge_dev *hdev, char *buf, int len, if (ret) return ret; - bitmap = (struct hclge_dbg_bitmap_cmd *)&desc.data[1]; + req.bitmap = (u8)le32_to_cpu(desc.data[1]); *pos += scnprintf(buf + *pos, len - *pos, "port_mask: %#x\n", - bitmap->bit0); + req.bit0); *pos += scnprintf(buf + *pos, len - *pos, "port_shaping_pass: %#x\n", - bitmap->bit1); + req.bit1); return 0; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 
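/*
 * Sketch of the endianness fix in the hclge debugfs hunks above:
 * instead of casting a pointer at a little-endian descriptor word and
 * dereferencing bitfields, convert the word to CPU order first and
 * mask out the bits. The byte layout below is a simplified stand-in
 * for the real bitmap command word.
 */
#include <stdint.h>
#include <stdio.h>

/* assemble a little-endian 32-bit word regardless of host endianness */
static uint32_t demo_le32_to_cpu(const uint8_t raw[4])
{
	return (uint32_t)raw[0] | ((uint32_t)raw[1] << 8) |
	       ((uint32_t)raw[2] << 16) | ((uint32_t)raw[3] << 24);
}

int main(void)
{
	uint8_t desc_data[4] = { 0x0b, 0x00, 0x00, 0x00 };	/* bits 0,1,3 */
	uint8_t bitmap = (uint8_t)demo_le32_to_cpu(desc_data);

	printf("bit0=%u bit1=%u bit2=%u bit3=%u\n",
	       bitmap & 0x1, (bitmap >> 1) & 0x1,
	       (bitmap >> 2) & 0x1, (bitmap >> 3) & 0x1);
	return 0;
}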
bb9b026ae88e..93aa7f2bdc13 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -1560,8 +1560,11 @@ static int hclge_config_tm_hw_err_int(struct hclge_dev *hdev, bool en) /* configure TM QCN hw errors */ hclge_cmd_setup_basic_desc(&desc, HCLGE_TM_QCN_MEM_INT_CFG, false); - if (en) + desc.data[0] = cpu_to_le32(HCLGE_TM_QCN_ERR_INT_TYPE); + if (en) { + desc.data[0] |= cpu_to_le32(HCLGE_TM_QCN_FIFO_INT_EN); desc.data[1] = cpu_to_le32(HCLGE_TM_QCN_MEM_ERR_INT_EN); + } ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index 07987fb8332e..d811eeefe2c0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -50,6 +50,8 @@ #define HCLGE_PPP_MPF_ECC_ERR_INT3_EN 0x003F #define HCLGE_PPP_MPF_ECC_ERR_INT3_EN_MASK 0x003F #define HCLGE_TM_SCH_ECC_ERR_INT_EN 0x3 +#define HCLGE_TM_QCN_ERR_INT_TYPE 0x29 +#define HCLGE_TM_QCN_FIFO_INT_EN 0xFFFF00 #define HCLGE_TM_QCN_MEM_ERR_INT_EN 0xFFFFFF #define HCLGE_NCSI_ERR_INT_EN 0x3 #define HCLGE_NCSI_ERR_INT_TYPE 0x9 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f5b8d1fee0f1..d891390d492f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2847,33 +2847,29 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state)) - mod_delayed_work_on(cpumask_first(&hdev->affinity_mask), - hclge_wq, &hdev->service_task, 0); + mod_delayed_work(hclge_wq, &hdev->service_task, 0); } static void hclge_reset_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && + test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state) && !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) - mod_delayed_work_on(cpumask_first(&hdev->affinity_mask), - hclge_wq, &hdev->service_task, 0); + mod_delayed_work(hclge_wq, &hdev->service_task, 0); } static void hclge_errhand_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && !test_and_set_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state)) - mod_delayed_work_on(cpumask_first(&hdev->affinity_mask), - hclge_wq, &hdev->service_task, 0); + mod_delayed_work(hclge_wq, &hdev->service_task, 0); } void hclge_task_schedule(struct hclge_dev *hdev, unsigned long delay_time) { if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && !test_bit(HCLGE_STATE_RST_FAIL, &hdev->state)) - mod_delayed_work_on(cpumask_first(&hdev->affinity_mask), - hclge_wq, &hdev->service_task, - delay_time); + mod_delayed_work(hclge_wq, &hdev->service_task, delay_time); } static int hclge_get_mac_link_status(struct hclge_dev *hdev, int *link_status) @@ -3491,33 +3487,14 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev) hdev->num_msi_used += 1; } -static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify, - const cpumask_t *mask) -{ - struct hclge_dev *hdev = container_of(notify, struct hclge_dev, - affinity_notify); - - cpumask_copy(&hdev->affinity_mask, mask); -} - -static void hclge_irq_affinity_release(struct kref *ref) -{ -} - static void hclge_misc_affinity_setup(struct hclge_dev *hdev) { 
irq_set_affinity_hint(hdev->misc_vector.vector_irq, &hdev->affinity_mask); - - hdev->affinity_notify.notify = hclge_irq_affinity_notify; - hdev->affinity_notify.release = hclge_irq_affinity_release; - irq_set_affinity_notifier(hdev->misc_vector.vector_irq, - &hdev->affinity_notify); } static void hclge_misc_affinity_teardown(struct hclge_dev *hdev) { - irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL); irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL); } @@ -13052,7 +13029,7 @@ static int hclge_init(void) { pr_info("%s is initializing\n", HCLGE_NAME); - hclge_wq = alloc_workqueue("%s", 0, 0, HCLGE_NAME); + hclge_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGE_NAME); if (!hclge_wq) { pr_err("%s: failed to create workqueue\n", HCLGE_NAME); return -ENOMEM; @@ -13065,6 +13042,7 @@ static int hclge_init(void) static void hclge_exit(void) { + hnae3_unregister_ae_algo_prepare(&ae_algo); hnae3_unregister_ae_algo(&ae_algo); destroy_workqueue(hclge_wq); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index de6afbcbfbac..69cd8f87b4c8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -944,7 +944,6 @@ struct hclge_dev { /* affinity mask and notify for misc interrupt */ cpumask_t affinity_mask; - struct irq_affinity_notify affinity_notify; struct hclge_ptp *ptp; struct devlink *devlink; }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index f314dbd3ce11..95074e91a846 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -752,6 +752,8 @@ static void hclge_tm_pg_info_init(struct hclge_dev *hdev) hdev->tm_info.pg_info[i].tc_bit_map = hdev->hw_tc_map; for (k = 0; k < hdev->tm_info.num_tc; k++) hdev->tm_info.pg_info[i].tc_dwrr[k] = BW_PERCENT; + for (; k < HNAE3_MAX_TC; k++) + hdev->tm_info.pg_info[i].tc_dwrr[k] = 0; } } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 5fdac8685f95..cf00ad7bb881 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2232,6 +2232,7 @@ static void hclgevf_get_misc_vector(struct hclgevf_dev *hdev) void hclgevf_reset_task_schedule(struct hclgevf_dev *hdev) { if (!test_bit(HCLGEVF_STATE_REMOVING, &hdev->state) && + test_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state) && !test_and_set_bit(HCLGEVF_STATE_RST_SERVICE_SCHED, &hdev->state)) mod_delayed_work(hclgevf_wq, &hdev->service_task, 0); @@ -2273,9 +2274,9 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) hdev->reset_attempts = 0; hdev->last_reset_time = jiffies; - while ((hdev->reset_type = - hclgevf_get_reset_level(hdev, &hdev->reset_pending)) - != HNAE3_NONE_RESET) + hdev->reset_type = + hclgevf_get_reset_level(hdev, &hdev->reset_pending); + if (hdev->reset_type != HNAE3_NONE_RESET) hclgevf_reset(hdev); } else if (test_and_clear_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state)) { @@ -3449,6 +3450,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) hclgevf_init_rxd_adv_layout(hdev); + set_bit(HCLGEVF_STATE_SERVICE_INITED, &hdev->state); + hdev->last_reset_time = jiffies; dev_info(&hdev->pdev->dev, "finished initializing %s driver\n", HCLGEVF_DRIVER_NAME); @@ -3899,7 +3902,7 @@ static int 
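/*
 * Sketch of the "schedule at most once" guard visible in the hclge
 * task-scheduling helpers above: an atomic test-and-set flag makes
 * sure the service task is queued only once until it runs and clears
 * the flag again. The work queue is reduced to a counter here; this is
 * a generic illustration, not the kernel's test_and_set_bit() API.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag demo_task_scheduled = ATOMIC_FLAG_INIT;
static int demo_queued;

static void demo_task_schedule(void)
{
	/* only the caller that flips the flag from clear to set queues */
	if (!atomic_flag_test_and_set(&demo_task_scheduled))
		demo_queued++;
}

static void demo_task_run(void)
{
	atomic_flag_clear(&demo_task_scheduled);	/* allow rescheduling */
}

int main(void)
{
	demo_task_schedule();
	demo_task_schedule();	/* coalesced: still queued only once */
	printf("queued %d time(s)\n", demo_queued);

	demo_task_run();
	demo_task_schedule();	/* after the run it can be queued again */
	printf("queued %d time(s)\n", demo_queued);
	return 0;
}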
hclgevf_init(void) { pr_info("%s is initializing\n", HCLGEVF_NAME); - hclgevf_wq = alloc_workqueue("%s", 0, 0, HCLGEVF_NAME); + hclgevf_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGEVF_NAME); if (!hclgevf_wq) { pr_err("%s: failed to create workqueue\n", HCLGEVF_NAME); return -ENOMEM; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 883130a9b48f..28288d7e3303 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -146,6 +146,7 @@ enum hclgevf_states { HCLGEVF_STATE_REMOVING, HCLGEVF_STATE_NIC_REGISTERED, HCLGEVF_STATE_ROCE_REGISTERED, + HCLGEVF_STATE_SERVICE_INITED, /* task states */ HCLGEVF_STATE_RST_SERVICE_SCHED, HCLGEVF_STATE_RST_HANDLING, diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 5b2143f4b1f8..3178efd98006 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -113,7 +113,8 @@ enum e1000_boards { board_pch2lan, board_pch_lpt, board_pch_spt, - board_pch_cnp + board_pch_cnp, + board_pch_tgp }; struct e1000_ps_page { @@ -499,6 +500,7 @@ extern const struct e1000_info e1000_pch2_info; extern const struct e1000_info e1000_pch_lpt_info; extern const struct e1000_info e1000_pch_spt_info; extern const struct e1000_info e1000_pch_cnp_info; +extern const struct e1000_info e1000_pch_tgp_info; extern const struct e1000_info e1000_es2_info; void e1000e_ptp_init(struct e1000_adapter *adapter); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 60c582a16821..5e4fc9b4e2ad 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -4813,7 +4813,7 @@ static s32 e1000_reset_hw_ich8lan(struct e1000_hw *hw) static s32 e1000_init_hw_ich8lan(struct e1000_hw *hw) { struct e1000_mac_info *mac = &hw->mac; - u32 ctrl_ext, txdctl, snoop; + u32 ctrl_ext, txdctl, snoop, fflt_dbg; s32 ret_val; u16 i; @@ -4872,6 +4872,15 @@ static s32 e1000_init_hw_ich8lan(struct e1000_hw *hw) snoop = (u32)~(PCIE_NO_SNOOP_ALL); e1000e_set_pcie_no_snoop(hw, snoop); + /* Enable workaround for packet loss issue on TGP PCH + * Do not gate DMA clock from the modPHY block + */ + if (mac->type >= e1000_pch_tgp) { + fflt_dbg = er32(FFLT_DBG); + fflt_dbg |= E1000_FFLT_DBG_DONT_GATE_WAKE_DMA_CLK; + ew32(FFLT_DBG, fflt_dbg); + } + ctrl_ext = er32(CTRL_EXT); ctrl_ext |= E1000_CTRL_EXT_RO_DIS; ew32(CTRL_EXT, ctrl_ext); @@ -5992,3 +6001,23 @@ const struct e1000_info e1000_pch_cnp_info = { .phy_ops = &ich8_phy_ops, .nvm_ops = &spt_nvm_ops, }; + +const struct e1000_info e1000_pch_tgp_info = { + .mac = e1000_pch_tgp, + .flags = FLAG_IS_ICH + | FLAG_HAS_WOL + | FLAG_HAS_HW_TIMESTAMP + | FLAG_HAS_CTRLEXT_ON_LOAD + | FLAG_HAS_AMT + | FLAG_HAS_FLASH + | FLAG_HAS_JUMBO_FRAMES + | FLAG_APME_IN_WUC, + .flags2 = FLAG2_HAS_PHY_STATS + | FLAG2_HAS_EEE, + .pba = 26, + .max_hw_frame_size = 9022, + .get_variants = e1000_get_variants_ich8lan, + .mac_ops = &ich8_mac_ops, + .phy_ops = &ich8_phy_ops, + .nvm_ops = &spt_nvm_ops, +}; diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index d6a092e5ee74..2504b11c3169 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -289,6 +289,9 @@ /* Proprietary Latency Tolerance Reporting PCI Capability */ #define E1000_PCI_LTR_CAP_LPT 
0xA8 +/* Don't gate wake DMA clock */ +#define E1000_FFLT_DBG_DONT_GATE_WAKE_DMA_CLK 0x1000 + void e1000e_write_protect_nvm_ich8lan(struct e1000_hw *hw); void e1000e_set_kmrn_lock_loss_workaround_ich8lan(struct e1000_hw *hw, bool state); diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 900b3ab998bd..ebcb2a30add0 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -51,6 +51,7 @@ static const struct e1000_info *e1000_info_tbl[] = { [board_pch_lpt] = &e1000_pch_lpt_info, [board_pch_spt] = &e1000_pch_spt_info, [board_pch_cnp] = &e1000_pch_cnp_info, + [board_pch_tgp] = &e1000_pch_tgp_info, }; struct e1000_reg_info { @@ -7896,28 +7897,28 @@ static const struct pci_device_id e1000_pci_tbl[] = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_CMP_I219_V11), board_pch_cnp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_CMP_I219_LM12), board_pch_spt }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_CMP_I219_V12), board_pch_spt }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM13), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V13), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM14), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V14), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM15), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V15), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM23), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V23), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM16), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V16), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM17), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V17), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM22), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V22), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM18), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V18), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM19), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V19), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM20), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V20), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM21), board_pch_cnp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_cnp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM13), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V13), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM14), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V14), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_LM15), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_TGP_I219_V15), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM23), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V23), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM16), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V16), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_LM17), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ADP_I219_V17), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_LM22), 
board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_RPL_I219_V22), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM18), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V18), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_LM19), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_MTP_I219_V19), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM20), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V20), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_LM21), board_pch_tgp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_tgp }, { 0, 0, 0, 0, 0, 0, 0 } /* terminate list */ }; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2f20980dd9a5..e04b540cedc8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4871,7 +4871,8 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf) { int i; - i40e_free_misc_vector(pf); + if (test_bit(__I40E_MISC_IRQ_REQUESTED, pf->state)) + i40e_free_misc_vector(pf); i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector, I40E_IWARP_IRQ_PILE_ID); @@ -10113,7 +10114,7 @@ static int i40e_get_capabilities(struct i40e_pf *pf, if (pf->hw.aq.asq_last_status == I40E_AQ_RC_ENOMEM) { /* retry with a larger buffer */ buf_len = data_size; - } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK) { + } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK || err) { dev_info(&pf->pdev->dev, "capability discovery failed, err %s aq_err %s\n", i40e_stat_str(&pf->hw, err), diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 23762a7ef740..cada4e0e40b4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1965,7 +1965,6 @@ static void iavf_watchdog_task(struct work_struct *work) } adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; - mutex_unlock(&adapter->crit_lock); queue_delayed_work(iavf_wq, &adapter->watchdog_task, msecs_to_jiffies(10)); diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 2fb81e359cdf..df5ad4de1f00 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -25,6 +25,8 @@ static enum ice_status ice_set_mac_type(struct ice_hw *hw) case ICE_DEV_ID_E810C_BACKPLANE: case ICE_DEV_ID_E810C_QSFP: case ICE_DEV_ID_E810C_SFP: + case ICE_DEV_ID_E810_XXV_BACKPLANE: + case ICE_DEV_ID_E810_XXV_QSFP: case ICE_DEV_ID_E810_XXV_SFP: hw->mac_type = ICE_MAC_E810; break; diff --git a/drivers/net/ethernet/intel/ice/ice_devids.h b/drivers/net/ethernet/intel/ice/ice_devids.h index 9d8194671f6a..ef4392e6e244 100644 --- a/drivers/net/ethernet/intel/ice/ice_devids.h +++ b/drivers/net/ethernet/intel/ice/ice_devids.h @@ -21,6 +21,10 @@ #define ICE_DEV_ID_E810C_QSFP 0x1592 /* Intel(R) Ethernet Controller E810-C for SFP */ #define ICE_DEV_ID_E810C_SFP 0x1593 +/* Intel(R) Ethernet Controller E810-XXV for backplane */ +#define ICE_DEV_ID_E810_XXV_BACKPLANE 0x1599 +/* Intel(R) Ethernet Controller E810-XXV for QSFP */ +#define ICE_DEV_ID_E810_XXV_QSFP 0x159A /* Intel(R) Ethernet Controller E810-XXV for SFP */ #define ICE_DEV_ID_E810_XXV_SFP 0x159B /* Intel(R) Ethernet Connection E823-C for backplane */ diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 
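/*
 * Sketch of the ID -> board -> feature-info indirection the e1000e
 * hunks above extend: the PCI table maps each device ID to a board
 * enum, and the enum indexes a table of per-board capability structs,
 * so adding a board is a new enum entry plus a new info row. Device
 * IDs, names and fields below are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

enum demo_board { DEMO_BOARD_CNP, DEMO_BOARD_TGP, DEMO_NUM_BOARDS };

struct demo_info {
	const char *name;
	unsigned int max_frame;
};

static const struct demo_info demo_info_tbl[DEMO_NUM_BOARDS] = {
	[DEMO_BOARD_CNP] = { "cnp-class MAC", 9022 },
	[DEMO_BOARD_TGP] = { "tgp-class MAC", 9022 },
};

struct demo_id_entry { uint16_t device; enum demo_board board; };

static const struct demo_id_entry demo_pci_tbl[] = {
	{ 0x15F4, DEMO_BOARD_CNP },	/* placeholder device IDs */
	{ 0x15FB, DEMO_BOARD_TGP },
	{ 0 }				/* terminate list */
};

static const struct demo_info *demo_lookup(uint16_t device)
{
	for (int i = 0; demo_pci_tbl[i].device; i++)
		if (demo_pci_tbl[i].device == device)
			return &demo_info_tbl[demo_pci_tbl[i].board];
	return NULL;
}

int main(void)
{
	const struct demo_info *info = demo_lookup(0x15FB);

	if (info)
		printf("probing %s, max frame %u\n", info->name, info->max_frame);
	return 0;
}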
14afce82ef63..da7288bdc9a3 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -63,7 +63,8 @@ static int ice_info_fw_api(struct ice_pf *pf, struct ice_info_ctx *ctx) { struct ice_hw *hw = &pf->hw; - snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u", hw->api_maj_ver, hw->api_min_ver); + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", hw->api_maj_ver, + hw->api_min_ver, hw->api_patch); return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c index 06ac9badee77..1ac96dc66d0d 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c @@ -1668,7 +1668,7 @@ static u16 ice_tunnel_idx_to_entry(struct ice_hw *hw, enum ice_tunnel_type type, for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].type == type && - idx--) + idx-- == 0) return i; WARN_ON_ONCE(1); @@ -1828,7 +1828,7 @@ int ice_udp_tunnel_set_port(struct net_device *netdev, unsigned int table, u16 index; tnl_type = ti->type == UDP_TUNNEL_TYPE_VXLAN ? TNL_VXLAN : TNL_GENEVE; - index = ice_tunnel_idx_to_entry(&pf->hw, idx, tnl_type); + index = ice_tunnel_idx_to_entry(&pf->hw, tnl_type, idx); status = ice_create_tunnel(&pf->hw, index, tnl_type, ntohs(ti->port)); if (status) { diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 37c18c66b5c7..e375ac849aec 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -100,9 +100,9 @@ static void ice_display_lag_info(struct ice_lag *lag) */ static void ice_lag_info_event(struct ice_lag *lag, void *ptr) { - struct net_device *event_netdev, *netdev_tmp; struct netdev_notifier_bonding_info *info; struct netdev_bonding_info *bonding_info; + struct net_device *event_netdev; const char *lag_netdev_name; event_netdev = netdev_notifier_info_to_dev(ptr); @@ -123,19 +123,6 @@ static void ice_lag_info_event(struct ice_lag *lag, void *ptr) goto lag_out; } - rcu_read_lock(); - for_each_netdev_in_bond_rcu(lag->upper_netdev, netdev_tmp) { - if (!netif_is_ice(netdev_tmp)) - continue; - - if (netdev_tmp && netdev_tmp != lag->netdev && - lag->peer_netdev != netdev_tmp) { - dev_hold(netdev_tmp); - lag->peer_netdev = netdev_tmp; - } - } - rcu_read_unlock(); - if (bonding_info->slave.state) ice_lag_set_backup(lag); else @@ -319,6 +306,9 @@ ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event, case NETDEV_BONDING_INFO: ice_lag_info_event(lag, ptr); break; + case NETDEV_UNREGISTER: + ice_lag_unlink(lag, ptr); + break; default: break; } diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index dde9802c6c72..b718e196af2a 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2841,6 +2841,7 @@ void ice_napi_del(struct ice_vsi *vsi) */ int ice_vsi_release(struct ice_vsi *vsi) { + enum ice_status err; struct ice_pf *pf; if (!vsi->back) @@ -2912,6 +2913,10 @@ int ice_vsi_release(struct ice_vsi *vsi) ice_fltr_remove_all(vsi); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); + err = ice_rm_vsi_rdma_cfg(vsi->port_info, vsi->idx); + if (err) + dev_err(ice_pf_to_dev(vsi->back), "Failed to remove RDMA scheduler config for VSI %u, err %d\n", + vsi->vsi_num, err); ice_vsi_delete(vsi); ice_vsi_free_q_vectors(vsi); @@ -3092,6 +3097,10 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) 
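/*
 * Worked example of the selection bug fixed in ice_tunnel_idx_to_entry()
 * above: "if (match && idx--)" is false on the first match when idx is 0,
 * so the idx-th match is skipped, whereas "idx-- == 0" returns exactly
 * the idx-th match counting from zero. Table contents are invented.
 */
#include <stdio.h>

struct demo_tunnel { int valid; int type; };

static const struct demo_tunnel tbl[] = {
	{ 1, 1 }, { 1, 2 }, { 1, 2 }, { 1, 2 }, { 0, 0 },
};
#define N (sizeof(tbl) / sizeof(tbl[0]))

/* buggy: "idx--" evaluates to 0 (false) when idx == 0, skipping match #0 */
static int find_buggy(int type, unsigned int idx)
{
	for (unsigned int i = 0; i < N; i++)
		if (tbl[i].valid && tbl[i].type == type && idx--)
			return (int)i;
	return -1;
}

/* fixed: return the idx-th match, counting from zero */
static int find_fixed(int type, unsigned int idx)
{
	for (unsigned int i = 0; i < N; i++)
		if (tbl[i].valid && tbl[i].type == type && idx-- == 0)
			return (int)i;
	return -1;
}

int main(void)
{
	printf("buggy idx=0 -> entry %d\n", find_buggy(2, 0));	/* 2, not 1 */
	printf("fixed idx=0 -> entry %d\n", find_fixed(2, 0));	/* 1 */
	return 0;
}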
prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); + ret = ice_rm_vsi_rdma_cfg(vsi->port_info, vsi->idx); + if (ret) + dev_err(ice_pf_to_dev(vsi->back), "Failed to remove RDMA scheduler config for VSI %u, err %d\n", + vsi->vsi_num, ret); ice_vsi_free_q_vectors(vsi); /* SR-IOV determines needed MSIX resources all at once instead of per diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 0d6c143f6653..06fa93e597fb 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4224,6 +4224,9 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) if (!pf) return -ENOMEM; + /* initialize Auxiliary index to invalid value */ + pf->aux_idx = -1; + /* set up for high or low DMA */ err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); if (err) @@ -4615,7 +4618,8 @@ static void ice_remove(struct pci_dev *pdev) ice_aq_cancel_waiting_tasks(pf); ice_unplug_aux_dev(pf); - ida_free(&ice_aux_ida, pf->aux_idx); + if (pf->aux_idx >= 0) + ida_free(&ice_aux_ida, pf->aux_idx); set_bit(ICE_DOWN, pf->state); mutex_destroy(&(&pf->hw)->fdir_fltr_lock); @@ -5016,6 +5020,8 @@ static const struct pci_device_id ice_pci_tbl[] = { { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_BACKPLANE), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_QSFP), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_QSFP), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_SFP), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_BACKPLANE), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_QSFP), 0 }, diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 05cc5870e4ef..d1ef3d48a4b0 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1313,22 +1313,21 @@ ice_ptp_flush_tx_tracker(struct ice_pf *pf, struct ice_ptp_tx *tx) { u8 idx; - spin_lock(&tx->lock); - for (idx = 0; idx < tx->len; idx++) { u8 phy_idx = idx + tx->quad_offset; - /* Clear any potential residual timestamp in the PHY block */ - if (!pf->hw.reset_ongoing) - ice_clear_phy_tstamp(&pf->hw, tx->quad, phy_idx); - + spin_lock(&tx->lock); if (tx->tstamps[idx].skb) { dev_kfree_skb_any(tx->tstamps[idx].skb); tx->tstamps[idx].skb = NULL; } - } + clear_bit(idx, tx->in_use); + spin_unlock(&tx->lock); - spin_unlock(&tx->lock); + /* Clear any potential residual timestamp in the PHY block */ + if (!pf->hw.reset_ongoing) + ice_clear_phy_tstamp(&pf->hw, tx->quad, phy_idx); + } } /** @@ -1572,6 +1571,9 @@ err_kworker: */ void ice_ptp_release(struct ice_pf *pf) { + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return; + /* Disable timestamping for both Tx and Rx */ ice_ptp_cfg_timestamp(pf, false); diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index 9f07b6641705..2d9b10277186 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -2071,6 +2071,19 @@ enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle) } /** + * ice_rm_vsi_rdma_cfg - remove VSI and its RDMA children nodes + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function clears the VSI and its RDMA children nodes from scheduler tree + * for all TCs. 
+ */ +enum ice_status ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle) +{ + return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_RDMA); +} + +/** * ice_get_agg_info - get the aggregator ID * @hw: pointer to the hardware structure * @agg_id: aggregator ID diff --git a/drivers/net/ethernet/intel/ice/ice_sched.h b/drivers/net/ethernet/intel/ice/ice_sched.h index 9beef8f0ec76..fdf7a5882f07 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.h +++ b/drivers/net/ethernet/intel/ice/ice_sched.h @@ -89,6 +89,7 @@ enum ice_status ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, u8 owner, bool enable); enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle); +enum ice_status ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle); /* Tx scheduler rate limiter functions */ enum ice_status diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h index 4461f8b9a864..4e0203336c6b 100644 --- a/drivers/net/ethernet/intel/igc/igc_hw.h +++ b/drivers/net/ethernet/intel/igc/igc_hw.h @@ -22,8 +22,8 @@ #define IGC_DEV_ID_I220_V 0x15F7 #define IGC_DEV_ID_I225_K 0x3100 #define IGC_DEV_ID_I225_K2 0x3101 +#define IGC_DEV_ID_I226_K 0x3102 #define IGC_DEV_ID_I225_LMVP 0x5502 -#define IGC_DEV_ID_I226_K 0x5504 #define IGC_DEV_ID_I225_IT 0x0D9F #define IGC_DEV_ID_I226_LM 0x125B #define IGC_DEV_ID_I226_V 0x125C diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index 9338765da048..49d822a98ada 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -226,18 +226,85 @@ static const struct file_operations rvu_dbg_##name##_fops = { \ static void print_nix_qsize(struct seq_file *filp, struct rvu_pfvf *pfvf); +static void get_lf_str_list(struct rvu_block block, int pcifunc, + char *lfs) +{ + int lf = 0, seq = 0, len = 0, prev_lf = block.lf.max; + + for_each_set_bit(lf, block.lf.bmap, block.lf.max) { + if (lf >= block.lf.max) + break; + + if (block.fn_map[lf] != pcifunc) + continue; + + if (lf == prev_lf + 1) { + prev_lf = lf; + seq = 1; + continue; + } + + if (seq) + len += sprintf(lfs + len, "-%d,%d", prev_lf, lf); + else + len += (len ? 
sprintf(lfs + len, ",%d", lf) : + sprintf(lfs + len, "%d", lf)); + + prev_lf = lf; + seq = 0; + } + + if (seq) + len += sprintf(lfs + len, "-%d", prev_lf); + + lfs[len] = '\0'; +} + +static int get_max_column_width(struct rvu *rvu) +{ + int index, pf, vf, lf_str_size = 12, buf_size = 256; + struct rvu_block block; + u16 pcifunc; + char *buf; + + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (pf = 0; pf < rvu->hw->total_pfs; pf++) { + for (vf = 0; vf <= rvu->hw->total_vfs; vf++) { + pcifunc = pf << 10 | vf; + if (!pcifunc) + continue; + + for (index = 0; index < BLK_COUNT; index++) { + block = rvu->hw->block[index]; + if (!strlen(block.name)) + continue; + + get_lf_str_list(block, pcifunc, buf); + if (lf_str_size <= strlen(buf)) + lf_str_size = strlen(buf) + 1; + } + } + } + + kfree(buf); + return lf_str_size; +} + /* Dumps current provisioning status of all RVU block LFs */ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { - int index, off = 0, flag = 0, go_back = 0, len = 0; + int index, off = 0, flag = 0, len = 0, i = 0; struct rvu *rvu = filp->private_data; - int lf, pf, vf, pcifunc; + int bytes_not_copied = 0; struct rvu_block block; - int bytes_not_copied; - int lf_str_size = 12; + int pf, vf, pcifunc; int buf_size = 2048; + int lf_str_size; char *lfs; char *buf; @@ -249,6 +316,9 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, if (!buf) return -ENOSPC; + /* Get the maximum width of a column */ + lf_str_size = get_max_column_width(rvu); + lfs = kzalloc(lf_str_size, GFP_KERNEL); if (!lfs) { kfree(buf); @@ -262,65 +332,69 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp, "%-*s", lf_str_size, rvu->hw->block[index].name); } + off += scnprintf(&buf[off], buf_size - 1 - off, "\n"); + bytes_not_copied = copy_to_user(buffer + (i * off), buf, off); + if (bytes_not_copied) + goto out; + + i++; + *ppos += off; for (pf = 0; pf < rvu->hw->total_pfs; pf++) { for (vf = 0; vf <= rvu->hw->total_vfs; vf++) { + off = 0; + flag = 0; pcifunc = pf << 10 | vf; if (!pcifunc) continue; if (vf) { sprintf(lfs, "PF%d:VF%d", pf, vf - 1); - go_back = scnprintf(&buf[off], - buf_size - 1 - off, - "%-*s", lf_str_size, lfs); + off = scnprintf(&buf[off], + buf_size - 1 - off, + "%-*s", lf_str_size, lfs); } else { sprintf(lfs, "PF%d", pf); - go_back = scnprintf(&buf[off], - buf_size - 1 - off, - "%-*s", lf_str_size, lfs); + off = scnprintf(&buf[off], + buf_size - 1 - off, + "%-*s", lf_str_size, lfs); } - off += go_back; - for (index = 0; index < BLKTYPE_MAX; index++) { + for (index = 0; index < BLK_COUNT; index++) { block = rvu->hw->block[index]; if (!strlen(block.name)) continue; len = 0; lfs[len] = '\0'; - for (lf = 0; lf < block.lf.max; lf++) { - if (block.fn_map[lf] != pcifunc) - continue; + get_lf_str_list(block, pcifunc, lfs); + if (strlen(lfs)) flag = 1; - len += sprintf(&lfs[len], "%d,", lf); - } - if (flag) - len--; - lfs[len] = '\0'; off += scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size, lfs); - if (!strlen(lfs)) - go_back += lf_str_size; } - if (!flag) - off -= go_back; - else - flag = 0; - off--; - off += scnprintf(&buf[off], buf_size - 1 - off, "\n"); + if (flag) { + off += scnprintf(&buf[off], + buf_size - 1 - off, "\n"); + bytes_not_copied = copy_to_user(buffer + + (i * off), + buf, off); + if (bytes_not_copied) + goto out; + + i++; + *ppos += off; + } } } - bytes_not_copied = copy_to_user(buffer, buf, off); +out: kfree(lfs); kfree(buf); - if (bytes_not_copied) return 
-EFAULT; - *ppos = off; - return off; + return *ppos; } RVU_DEBUG_FOPS(rsrc_status, rsrc_attach_status, NULL); @@ -504,7 +578,7 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp, if (cmd_buf) ret = -EINVAL; - if (!strncmp(subtoken, "help", 4) || ret < 0) { + if (ret < 0 || !strncmp(subtoken, "help", 4)) { dev_info(rvu->dev, "Use echo <%s-lf > qsize\n", blk_string); goto qsize_write_done; } @@ -1719,6 +1793,10 @@ static int rvu_dbg_nix_band_prof_ctx_display(struct seq_file *m, void *unused) u16 pcifunc; char *str; + /* Ingress policers do not exist on all platforms */ + if (!nix_hw->ipolicer) + return 0; + for (layer = 0; layer < BAND_PROF_NUM_LAYERS; layer++) { if (layer == BAND_PROF_INVAL_LAYER) continue; @@ -1768,6 +1846,10 @@ static int rvu_dbg_nix_band_prof_rsrc_display(struct seq_file *m, void *unused) int layer; char *str; + /* Ingress policers do not exist on all platforms */ + if (!nix_hw->ipolicer) + return 0; + seq_puts(m, "\nBandwidth profile resource free count\n"); seq_puts(m, "=====================================\n"); for (layer = 0; layer < BAND_PROF_NUM_LAYERS; layer++) { diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 9ef4e942e31e..6970540dc470 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -2507,6 +2507,9 @@ static void nix_free_tx_vtag_entries(struct rvu *rvu, u16 pcifunc) return; nix_hw = get_nix_hw(rvu->hw, blkaddr); + if (!nix_hw) + return; + vlan = &nix_hw->txvlan; mutex_lock(&vlan->rsrc_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index cf97985628ab..02e77ffe5c3e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -155,6 +155,8 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; int err; + mlx5_debug_cq_remove(dev, cq); + mlx5_eq_del_cq(mlx5_get_async_eq(dev), cq); mlx5_eq_del_cq(&cq->eq->core, cq); @@ -162,16 +164,13 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); MLX5_SET(destroy_cq_in, in, uid, cq->uid); err = mlx5_cmd_exec_in(dev, destroy_cq, in); - if (err) - return err; synchronize_irq(cq->irqn); - mlx5_debug_cq_remove(dev, cq); mlx5_cq_put(cq); wait_for_completion(&cq->free); - return 0; + return err; } EXPORT_SYMBOL(mlx5_core_destroy_cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7b8c8187543a..03a7a4ce5cd5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -252,6 +252,7 @@ struct mlx5e_params { struct { u16 mode; u8 num_tc; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; } mqprio; bool rx_cqe_compress_def; bool tunneled_offload_en; @@ -845,6 +846,7 @@ struct mlx5e_priv { struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS]; struct mlx5e_channel_stats trap_stats; struct mlx5e_ptp_stats ptp_stats; + u16 stats_nch; u16 max_nch; u8 max_opened_tc; bool tx_ptp_opened; @@ -1100,12 +1102,6 @@ int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv, struct ethtool_pauseparam *pauseparam); /* mlx5e generic netdev management API */ -static inline unsigned int -mlx5e_calc_max_nch(struct mlx5e_priv *priv, const struct mlx5e_profile *profile) -{ - return priv->netdev->num_rx_queues / max_t(u8, 
profile->rq_groups, 1); -} - static inline bool mlx5e_tx_mpwqe_supported(struct mlx5_core_dev *mdev) { @@ -1114,11 +1110,13 @@ mlx5e_tx_mpwqe_supported(struct mlx5_core_dev *mdev) } int mlx5e_priv_init(struct mlx5e_priv *priv, + const struct mlx5e_profile *profile, struct net_device *netdev, struct mlx5_core_dev *mdev); void mlx5e_priv_cleanup(struct mlx5e_priv *priv); struct net_device * -mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int rxqs); +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, + unsigned int txqs, unsigned int rxqs); int mlx5e_attach_netdev(struct mlx5e_priv *priv); void mlx5e_detach_netdev(struct mlx5e_priv *priv); void mlx5e_destroy_netdev(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 41684a6c44e9..a88a1a48229f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -199,6 +199,9 @@ void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv); int mlx5e_create_flow_steering(struct mlx5e_priv *priv); void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv); +int mlx5e_fs_init(struct mlx5e_priv *priv); +void mlx5e_fs_cleanup(struct mlx5e_priv *priv); + int mlx5e_add_vlan_trap(struct mlx5e_priv *priv, int trap_id, int tir_num); void mlx5e_remove_vlan_trap(struct mlx5e_priv *priv); int mlx5e_add_mac_trap(struct mlx5e_priv *priv, int trap_id, int tir_num); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c index ac44bbe95c5c..d290d7276b8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c @@ -35,7 +35,7 @@ static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data, { int ch, i = 0; - for (ch = 0; ch < priv->max_nch; ch++) { + for (ch = 0; ch < priv->stats_nch; ch++) { void *buf = data + i; if (WARN_ON_ONCE(buf + @@ -51,7 +51,7 @@ static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data, static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv) { return (sizeof(struct mlx5e_hv_vhca_per_ring_stats) * - priv->max_nch); + priv->stats_nch); } static void mlx5e_hv_vhca_stats_work(struct work_struct *work) @@ -100,7 +100,7 @@ static void mlx5e_hv_vhca_stats_control(struct mlx5_hv_vhca_agent *agent, sagent = &priv->stats_agent; block->version = MLX5_HV_VHCA_STATS_VERSION; - block->rings = priv->max_nch; + block->rings = priv->stats_nch; if (!block->command) { cancel_delayed_work_sync(&priv->stats_agent.work); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index ee688dec67a9..3a86f66d1295 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -13,8 +13,6 @@ struct mlx5e_ptp_fs { bool valid; }; -#define MLX5E_PTP_CHANNEL_IX 0 - struct mlx5e_ptp_params { struct mlx5e_params params; struct mlx5e_sq_param txq_sq_param; @@ -509,6 +507,7 @@ static int mlx5e_init_ptp_rq(struct mlx5e_ptp *c, struct mlx5e_params *params, rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->stats = &c->priv->ptp_stats.rq; + rq->ix = MLX5E_PTP_CHANNEL_IX; rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); err = mlx5e_rq_set_handlers(rq, params, false); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index c96668bd701c..a71a32e00ebb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -8,6 +8,8 @@ #include "en_stats.h" #include <linux/ptp_classify.h> +#define MLX5E_PTP_CHANNEL_IX 0 + struct mlx5e_ptpsq { struct mlx5e_txqsq txqsq; struct mlx5e_cq ts_cq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index b5ddaa82755f..c6d2f8c78db7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -475,9 +475,6 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) esw_warn(mdev, "Failed to allocate bridge offloads workqueue\n"); goto err_alloc_wq; } - INIT_DELAYED_WORK(&br_offloads->update_work, mlx5_esw_bridge_update_work); - queue_delayed_work(br_offloads->wq, &br_offloads->update_work, - msecs_to_jiffies(MLX5_ESW_BRIDGE_UPDATE_INTERVAL)); br_offloads->nb.notifier_call = mlx5_esw_bridge_switchdev_event; err = register_switchdev_notifier(&br_offloads->nb); @@ -500,6 +497,9 @@ void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) err); goto err_register_netdev; } + INIT_DELAYED_WORK(&br_offloads->update_work, mlx5_esw_bridge_update_work); + queue_delayed_work(br_offloads->wq, &br_offloads->update_work, + msecs_to_jiffies(MLX5_ESW_BRIDGE_UPDATE_INTERVAL)); return; err_register_netdev: @@ -523,10 +523,10 @@ void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) if (!br_offloads) return; + cancel_delayed_work_sync(&br_offloads->update_work); unregister_netdevice_notifier(&br_offloads->netdev_nb); unregister_switchdev_blocking_notifier(&br_offloads->nb_blk); unregister_switchdev_notifier(&br_offloads->nb); - cancel_delayed_work(&br_offloads->update_work); destroy_workqueue(br_offloads->wq); rtnl_lock(); mlx5_esw_bridge_cleanup(esw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index b4e986818794..4a13ef561587 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -10,6 +10,8 @@ #include "en_tc.h" #include "rep/tc.h" #include "rep/neigh.h" +#include "lag.h" +#include "lag_mp.h" struct mlx5e_tc_tun_route_attr { struct net_device *out_dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 33de8f0092a6..fb5397324aa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -141,8 +141,7 @@ static void mlx5e_ipsec_set_swp(struct sk_buff *skb, * Pkt: MAC IP ESP IP L4 * * Transport Mode: - * SWP: OutL3 InL4 - * InL3 + * SWP: OutL3 OutL4 * Pkt: MAC IP ESP L4 * * Tunnel(VXLAN TCP/UDP) over Transport Mode @@ -171,31 +170,35 @@ static void mlx5e_ipsec_set_swp(struct sk_buff *skb, return; if (!xo->inner_ipproto) { - eseg->swp_inner_l3_offset = skb_network_offset(skb) / 2; - eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; - if (skb->protocol == htons(ETH_P_IPV6)) - eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; - if (xo->proto == IPPROTO_UDP) + switch (xo->proto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + /* IP | ESP | TCP */ + eseg->swp_outer_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + default: + break; + } + } else { 
+ /* Tunnel(VXLAN TCP/UDP) over Transport Mode */ + switch (xo->inner_ipproto) { + case IPPROTO_UDP: eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; - return; - } - - /* Tunnel(VXLAN TCP/UDP) over Transport Mode */ - switch (xo->inner_ipproto) { - case IPPROTO_UDP: - eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; - fallthrough; - case IPPROTO_TCP: - eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; - eseg->swp_inner_l4_offset = (skb->csum_start + skb->head - skb->data) / 2; - if (skb->protocol == htons(ETH_P_IPV6)) - eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; - break; - default: - break; + fallthrough; + case IPPROTO_TCP: + eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; + eseg->swp_inner_l4_offset = + (skb->csum_start + skb->head - skb->data) / 2; + if (skb->protocol == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + break; + default: + break; + } } - return; } void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 306fb5d6a36d..9d451b8ee467 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -2036,6 +2036,17 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable) } new_params = priv->channels.params; + /* Don't allow enabling TX-port-TS if MQPRIO mode channel offload is + * active, since it defines explicitly which TC accepts the packet. + * This conflicts with TX-port-TS hijacking the PTP traffic to a specific + * HW TX-queue. + */ + if (enable && new_params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + netdev_err(priv->netdev, + "%s: MQPRIO mode channel offload is active, cannot set the TX-port-TS\n", + __func__); + return -EINVAL; + } MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_PORT_TS, enable); /* No need to verify SQ stop room as * ptpsq.txqsq.stop_room <= generic_sq->stop_room, and both diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index c06b4b938ae7..d226cc5ab1d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -1186,10 +1186,6 @@ static int mlx5e_create_vlan_table(struct mlx5e_priv *priv) struct mlx5e_flow_table *ft; int err; - priv->fs.vlan = kvzalloc(sizeof(*priv->fs.vlan), GFP_KERNEL); - if (!priv->fs.vlan) - return -ENOMEM; - ft = &priv->fs.vlan->ft; ft->num_groups = 0; @@ -1198,10 +1194,8 @@ static int mlx5e_create_vlan_table(struct mlx5e_priv *priv) ft_attr.prio = MLX5E_NIC_PRIO; ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); - if (IS_ERR(ft->t)) { - err = PTR_ERR(ft->t); - goto err_free_t; - } + if (IS_ERR(ft->t)) + return PTR_ERR(ft->t); ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); if (!ft->g) { @@ -1221,9 +1215,6 @@ err_free_g: kfree(ft->g); err_destroy_vlan_table: mlx5_destroy_flow_table(ft->t); -err_free_t: - kvfree(priv->fs.vlan); - priv->fs.vlan = NULL; return err; } @@ -1232,7 +1223,6 @@ static void mlx5e_destroy_vlan_table(struct mlx5e_priv *priv) { mlx5e_del_vlan_rules(priv); mlx5e_destroy_flow_table(&priv->fs.vlan->ft); - kvfree(priv->fs.vlan); } static void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv) @@ -1351,3 +1341,17 @@ void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv) mlx5e_arfs_destroy_tables(priv); mlx5e_ethtool_cleanup_steering(priv); } + +int mlx5e_fs_init(struct mlx5e_priv 
*priv) +{ + priv->fs.vlan = kvzalloc(sizeof(*priv->fs.vlan), GFP_KERNEL); + if (!priv->fs.vlan) + return -ENOMEM; + return 0; +} + +void mlx5e_fs_cleanup(struct mlx5e_priv *priv) +{ + kvfree(priv->fs.vlan); + priv->fs.vlan = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3fd515e7bf30..41ef6eb70a58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2264,7 +2264,7 @@ void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv) } static int mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc, - struct tc_mqprio_qopt_offload *mqprio) + struct netdev_tc_txq *tc_to_txq) { int tc, err; @@ -2282,11 +2282,8 @@ static int mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc, for (tc = 0; tc < ntc; tc++) { u16 count, offset; - /* For DCB mode, map netdev TCs to offset 0 - * We have our own UP to TXQ mapping for QoS - */ - count = mqprio ? mqprio->qopt.count[tc] : nch; - offset = mqprio ? mqprio->qopt.offset[tc] : 0; + count = tc_to_txq[tc].count; + offset = tc_to_txq[tc].offset; netdev_set_tc_queue(netdev, tc, count, offset); } @@ -2315,19 +2312,24 @@ int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) { + struct netdev_tc_txq old_tc_to_txq[TC_MAX_QUEUE], *tc_to_txq; struct net_device *netdev = priv->netdev; int old_num_txqs, old_ntc; int num_rxqs, nch, ntc; int err; + int i; old_num_txqs = netdev->real_num_tx_queues; old_ntc = netdev->num_tc ? : 1; + for (i = 0; i < ARRAY_SIZE(old_tc_to_txq); i++) + old_tc_to_txq[i] = netdev->tc_to_txq[i]; nch = priv->channels.params.num_channels; - ntc = mlx5e_get_dcb_num_tc(&priv->channels.params); + ntc = priv->channels.params.mqprio.num_tc; num_rxqs = nch * priv->profile->rq_groups; + tc_to_txq = priv->channels.params.mqprio.tc_to_txq; - err = mlx5e_netdev_set_tcs(netdev, nch, ntc, NULL); + err = mlx5e_netdev_set_tcs(netdev, nch, ntc, tc_to_txq); if (err) goto err_out; err = mlx5e_update_tx_netdev_queues(priv); @@ -2350,11 +2352,14 @@ err_txqs: WARN_ON_ONCE(netif_set_real_num_tx_queues(netdev, old_num_txqs)); err_tcs: - mlx5e_netdev_set_tcs(netdev, old_num_txqs / old_ntc, old_ntc, NULL); + WARN_ON_ONCE(mlx5e_netdev_set_tcs(netdev, old_num_txqs / old_ntc, old_ntc, + old_tc_to_txq)); err_out: return err; } +static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_update_netdev_queues); + static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, struct mlx5e_params *params) { @@ -2861,6 +2866,58 @@ static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) return 0; } +static void mlx5e_mqprio_build_default_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + int ntc, int nch) +{ + int tc; + + memset(tc_to_txq, 0, sizeof(*tc_to_txq) * TC_MAX_QUEUE); + + /* Map netdev TCs to offset 0. 
+ * We have our own UP to TXQ mapping for DCB mode of QoS + */ + for (tc = 0; tc < ntc; tc++) { + tc_to_txq[tc] = (struct netdev_tc_txq) { + .count = nch, + .offset = 0, + }; + } +} + +static void mlx5e_mqprio_build_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + struct tc_mqprio_qopt *qopt) +{ + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + tc_to_txq[tc] = (struct netdev_tc_txq) { + .count = qopt->count[tc], + .offset = qopt->offset[tc], + }; + } +} + +static void mlx5e_params_mqprio_dcb_set(struct mlx5e_params *params, u8 num_tc) +{ + params->mqprio.mode = TC_MQPRIO_MODE_DCB; + params->mqprio.num_tc = num_tc; + mlx5e_mqprio_build_default_tc_to_txq(params->mqprio.tc_to_txq, num_tc, + params->num_channels); +} + +static void mlx5e_params_mqprio_channel_set(struct mlx5e_params *params, + struct tc_mqprio_qopt *qopt) +{ + params->mqprio.mode = TC_MQPRIO_MODE_CHANNEL; + params->mqprio.num_tc = qopt->num_tc; + mlx5e_mqprio_build_tc_to_txq(params->mqprio.tc_to_txq, qopt); +} + +static void mlx5e_params_mqprio_reset(struct mlx5e_params *params) +{ + mlx5e_params_mqprio_dcb_set(params, 1); +} + static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv, struct tc_mqprio_qopt *mqprio) { @@ -2874,8 +2931,7 @@ static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv, return -EINVAL; new_params = priv->channels.params; - new_params.mqprio.mode = TC_MQPRIO_MODE_DCB; - new_params.mqprio.num_tc = tc ? tc : 1; + mlx5e_params_mqprio_dcb_set(&new_params, tc ? tc : 1); err = mlx5e_safe_switch_params(priv, &new_params, mlx5e_num_channels_changed_ctx, NULL, true); @@ -2889,9 +2945,17 @@ static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, struct tc_mqprio_qopt_offload *mqprio) { struct net_device *netdev = priv->netdev; + struct mlx5e_ptp *ptp_channel; int agg_count = 0; int i; + ptp_channel = priv->channels.ptp; + if (ptp_channel && test_bit(MLX5E_PTP_STATE_TX, ptp_channel->state)) { + netdev_err(netdev, + "Cannot activate MQPRIO mode channel since it conflicts with TX port TS\n"); + return -EINVAL; + } + if (mqprio->qopt.offset[0] != 0 || mqprio->qopt.num_tc < 1 || mqprio->qopt.num_tc > MLX5E_MAX_NUM_MQPRIO_CH_TC) return -EINVAL; @@ -2917,8 +2981,8 @@ static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, agg_count += mqprio->qopt.count[i]; } - if (priv->channels.params.num_channels < agg_count) { - netdev_err(netdev, "Num of queues (%d) exceeds available (%d)\n", + if (priv->channels.params.num_channels != agg_count) { + netdev_err(netdev, "Num of queues (%d) does not match available (%d)\n", agg_count, priv->channels.params.num_channels); return -EINVAL; } @@ -2926,25 +2990,12 @@ static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, return 0; } -static int mlx5e_mqprio_channel_set_tcs_ctx(struct mlx5e_priv *priv, void *ctx) -{ - struct tc_mqprio_qopt_offload *mqprio = (struct tc_mqprio_qopt_offload *)ctx; - struct net_device *netdev = priv->netdev; - u8 num_tc; - - if (priv->channels.params.mqprio.mode != TC_MQPRIO_MODE_CHANNEL) - return -EINVAL; - - num_tc = priv->channels.params.mqprio.num_tc; - mlx5e_netdev_set_tcs(netdev, 0, num_tc, mqprio); - - return 0; -} - static int mlx5e_setup_tc_mqprio_channel(struct mlx5e_priv *priv, struct tc_mqprio_qopt_offload *mqprio) { + mlx5e_fp_preactivate preactivate; struct mlx5e_params new_params; + bool nch_changed; int err; err = mlx5e_mqprio_channel_validate(priv, mqprio); @@ -2952,12 +3003,12 @@ static int mlx5e_setup_tc_mqprio_channel(struct mlx5e_priv *priv, return err; new_params = priv->channels.params; - 
new_params.mqprio.mode = TC_MQPRIO_MODE_CHANNEL; - new_params.mqprio.num_tc = mqprio->qopt.num_tc; - err = mlx5e_safe_switch_params(priv, &new_params, - mlx5e_mqprio_channel_set_tcs_ctx, mqprio, true); + mlx5e_params_mqprio_channel_set(&new_params, &mqprio->qopt); - return err; + nch_changed = mlx5e_get_dcb_num_tc(&priv->channels.params) > 1; + preactivate = nch_changed ? mlx5e_num_channels_changed_ctx : + mlx5e_update_netdev_queues_ctx; + return mlx5e_safe_switch_params(priv, &new_params, preactivate, NULL, true); } static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, @@ -3065,7 +3116,7 @@ void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s) { int i; - for (i = 0; i < priv->max_nch; i++) { + for (i = 0; i < priv->stats_nch; i++) { struct mlx5e_channel_stats *channel_stats = &priv->channel_stats[i]; struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq; struct mlx5e_rq_stats *rq_stats = &channel_stats->rq; @@ -3274,20 +3325,67 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable) return mlx5_set_port_fcs(mdev, !enable); } +static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {}; + bool supported, curr_state; + int err; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + return 0; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + + supported = MLX5_GET(pcmr_reg, in, rx_ts_over_crc_cap); + curr_state = MLX5_GET(pcmr_reg, in, rx_ts_over_crc); + + if (!supported || enable == curr_state) + return 0; + + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, rx_ts_over_crc, enable); + + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + static int set_feature_rx_fcs(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels *chs = &priv->channels; + struct mlx5_core_dev *mdev = priv->mdev; int err; mutex_lock(&priv->state_lock); - priv->channels.params.scatter_fcs_en = enable; - err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable); - if (err) - priv->channels.params.scatter_fcs_en = !enable; + if (enable) { + err = mlx5e_set_rx_port_ts(mdev, false); + if (err) + goto out; - mutex_unlock(&priv->state_lock); + chs->params.scatter_fcs_en = true; + err = mlx5e_modify_channels_scatter_fcs(chs, true); + if (err) { + chs->params.scatter_fcs_en = false; + mlx5e_set_rx_port_ts(mdev, true); + } + } else { + chs->params.scatter_fcs_en = false; + err = mlx5e_modify_channels_scatter_fcs(chs, false); + if (err) { + chs->params.scatter_fcs_en = true; + goto out; + } + err = mlx5e_set_rx_port_ts(mdev, true); + if (err) { + mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err); + err = 0; + } + } +out: + mutex_unlock(&priv->state_lock); return err; } @@ -4186,13 +4284,11 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 struct mlx5_core_dev *mdev = priv->mdev; u8 rx_cq_period_mode; - priv->max_nch = mlx5e_calc_max_nch(priv, priv->profile); - params->sw_mtu = mtu; params->hard_mtu = MLX5E_ETH_HARD_MTU; params->num_channels = min_t(unsigned int, MLX5E_MAX_NUM_CHANNELS / 2, priv->max_nch); - params->mqprio.num_tc = 1; + mlx5e_params_mqprio_reset(params); /* Set an initial non-zero value, so that mlx5e_select_queue won't * divide by zero if called before first activating channels. 
@@ -4482,6 +4578,12 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, mlx5e_timestamp_init(priv); + err = mlx5e_fs_init(priv); + if (err) { + mlx5_core_err(mdev, "FS initialization failed, %d\n", err); + return err; + } + err = mlx5e_ipsec_init(priv); if (err) mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err); @@ -4499,6 +4601,7 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) mlx5e_health_destroy_reporters(priv); mlx5e_tls_cleanup(priv); mlx5e_ipsec_cleanup(priv); + mlx5e_fs_cleanup(priv); } static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) @@ -4682,8 +4785,35 @@ static const struct mlx5e_profile mlx5e_nic_profile = { .rx_ptp_support = true, }; +static unsigned int +mlx5e_calc_max_nch(struct mlx5_core_dev *mdev, struct net_device *netdev, + const struct mlx5e_profile *profile) + +{ + unsigned int max_nch, tmp; + + /* core resources */ + max_nch = mlx5e_get_max_num_channels(mdev); + + /* netdev rx queues */ + tmp = netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1); + max_nch = min_t(unsigned int, max_nch, tmp); + + /* netdev tx queues */ + tmp = netdev->num_tx_queues; + if (mlx5_qos_is_supported(mdev)) + tmp -= mlx5e_qos_max_leaf_nodes(mdev); + if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) + tmp -= profile->max_tc; + tmp = tmp / profile->max_tc; + max_nch = min_t(unsigned int, max_nch, tmp); + + return max_nch; +} + /* mlx5e generic netdev management API (move to en_common.c) */ int mlx5e_priv_init(struct mlx5e_priv *priv, + const struct mlx5e_profile *profile, struct net_device *netdev, struct mlx5_core_dev *mdev) { @@ -4691,6 +4821,8 @@ int mlx5e_priv_init(struct mlx5e_priv *priv, priv->mdev = mdev; priv->netdev = netdev; priv->msglevel = MLX5E_MSG_LEVEL; + priv->max_nch = mlx5e_calc_max_nch(mdev, netdev, profile); + priv->stats_nch = priv->max_nch; priv->max_opened_tc = 1; if (!alloc_cpumask_var(&priv->scratchpad.cpumask, GFP_KERNEL)) @@ -4734,7 +4866,8 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) } struct net_device * -mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int rxqs) +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, + unsigned int txqs, unsigned int rxqs) { struct net_device *netdev; int err; @@ -4745,7 +4878,7 @@ mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int return NULL; } - err = mlx5e_priv_init(netdev_priv(netdev), netdev, mdev); + err = mlx5e_priv_init(netdev_priv(netdev), profile, netdev, mdev); if (err) { mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); goto err_free_netdev; @@ -4787,7 +4920,7 @@ int mlx5e_attach_netdev(struct mlx5e_priv *priv) clear_bit(MLX5E_STATE_DESTROYING, &priv->state); /* max number of channels may have changed */ - max_nch = mlx5e_get_max_num_channels(priv->mdev); + max_nch = mlx5e_calc_max_nch(priv->mdev, priv->netdev, profile); if (priv->channels.params.num_channels > max_nch) { mlx5_core_warn(priv->mdev, "MLX5E: Reducing number of channels to %d\n", max_nch); /* Reducing the number of channels - RXFH has to be reset, and @@ -4795,7 +4928,18 @@ int mlx5e_attach_netdev(struct mlx5e_priv *priv) */ priv->netdev->priv_flags &= ~IFF_RXFH_CONFIGURED; priv->channels.params.num_channels = max_nch; + if (priv->channels.params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + mlx5_core_warn(priv->mdev, "MLX5E: Disabling MQPRIO channel mode\n"); + mlx5e_params_mqprio_reset(&priv->channels.params); + } + } + if (max_nch != priv->max_nch) { + mlx5_core_warn(priv->mdev, + "MLX5E: Updating max number of channels 
from %u to %u\n", + priv->max_nch, max_nch); + priv->max_nch = max_nch; } + /* 1. Set the real number of queues in the kernel the first time. * 2. Set our default XPS cpumask. * 3. Build the RQT. @@ -4860,7 +5004,7 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde struct mlx5e_priv *priv = netdev_priv(netdev); int err; - err = mlx5e_priv_init(priv, netdev, mdev); + err = mlx5e_priv_init(priv, new_profile, netdev, mdev); if (err) { mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); return err; @@ -4886,20 +5030,12 @@ priv_cleanup: int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, const struct mlx5e_profile *new_profile, void *new_ppriv) { - unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile); const struct mlx5e_profile *orig_profile = priv->profile; struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; void *orig_ppriv = priv->ppriv; int err, rollback_err; - /* sanity */ - if (new_max_nch != priv->max_nch) { - netdev_warn(netdev, "%s: Replacing profile with different max channels\n", - __func__); - return -EINVAL; - } - /* cleanup old profile */ mlx5e_detach_netdev(priv); priv->profile->cleanup(priv); @@ -4995,7 +5131,7 @@ static int mlx5e_probe(struct auxiliary_device *adev, nch = mlx5e_get_max_num_channels(mdev); txqs = nch * profile->max_tc + ptp_txqs + qos_sqs; rxqs = nch * profile->rq_groups; - netdev = mlx5e_create_netdev(mdev, txqs, rxqs); + netdev = mlx5e_create_netdev(mdev, profile, txqs, rxqs); if (!netdev) { mlx5_core_err(mdev, "mlx5e_create_netdev failed\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index ae71a17fdb27..0684ac6699b2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -596,7 +596,6 @@ static void mlx5e_build_rep_params(struct net_device *netdev) MLX5_CQ_PERIOD_MODE_START_FROM_CQE : MLX5_CQ_PERIOD_MODE_START_FROM_EQE; - priv->max_nch = mlx5e_calc_max_nch(priv, priv->profile); params = &priv->channels.params; params->num_channels = MLX5E_REP_PARAMS_DEF_NUM_CHANNELS; @@ -619,6 +618,11 @@ static void mlx5e_build_rep_params(struct net_device *netdev) params->mqprio.num_tc = 1; params->tunneled_offload_en = false; + /* Set an initial non-zero value, so that mlx5e_select_queue won't + * divide by zero if called before first activating channels. 
+ */ + priv->num_tc_x_num_ch = params->num_channels * params->mqprio.num_tc; + mlx5_query_min_inline(mdev, &params->tx_min_inline_mode); } @@ -644,7 +648,6 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev, netdev->hw_features |= NETIF_F_RXCSUM; netdev->features |= netdev->hw_features; - netdev->features |= NETIF_F_VLAN_CHALLENGED; netdev->features |= NETIF_F_NETNS_LOCAL; } @@ -1169,7 +1172,7 @@ mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) nch = mlx5e_get_max_num_channels(dev); txqs = nch * profile->max_tc; rxqs = nch * profile->rq_groups; - netdev = mlx5e_create_netdev(dev, txqs, rxqs); + netdev = mlx5e_create_netdev(dev, profile, txqs, rxqs); if (!netdev) { mlx5_core_warn(dev, "Failed to create representor netdev for vport %d\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3c65fd0bcf31..29a6586ef28d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1001,14 +1001,9 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, goto csum_unnecessary; if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) { - u8 ipproto = get_ip_proto(skb, network_depth, proto); - - if (unlikely(ipproto == IPPROTO_SCTP)) + if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP)) goto csum_unnecessary; - if (unlikely(mlx5_ipsec_is_rx_flow(cqe))) - goto csum_none; - stats->csum_complete++; skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)cqe->check_sum); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index e4f5b6395148..e1dd17019030 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -34,6 +34,7 @@ #include "en.h" #include "en_accel/tls.h" #include "en_accel/en_accel.h" +#include "en/ptp.h" static unsigned int stats_grps_num(struct mlx5e_priv *priv) { @@ -450,7 +451,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) memset(s, 0, sizeof(*s)); - for (i = 0; i < priv->max_nch; i++) { + for (i = 0; i < priv->stats_nch; i++) { struct mlx5e_channel_stats *channel_stats = &priv->channel_stats[i]; int j; @@ -2076,7 +2077,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp) if (priv->rx_ptp_opened) { for (i = 0; i < NUM_PTP_RQ_STATS; i++) sprintf(data + (idx++) * ETH_GSTRING_LEN, - ptp_rq_stats_desc[i].format); + ptp_rq_stats_desc[i].format, MLX5E_PTP_CHANNEL_IX); } return idx; } @@ -2119,7 +2120,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ptp) { return; } static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels) { - int max_nch = priv->max_nch; + int max_nch = priv->stats_nch; return (NUM_RQ_STATS * max_nch) + (NUM_CH_STATS * max_nch) + @@ -2133,7 +2134,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels) static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) { bool is_xsk = priv->xsk.ever_used; - int max_nch = priv->max_nch; + int max_nch = priv->stats_nch; int i, j, tc; for (i = 0; i < max_nch; i++) @@ -2175,7 +2176,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(channels) { bool is_xsk = priv->xsk.ever_used; - int max_nch = priv->max_nch; + int max_nch = priv->stats_nch; int i, j, tc; for (i = 0; i < max_nch; i++) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index ba8164792016..129ff7e0d65c 100644 ---
a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -67,6 +67,8 @@ #include "lib/fs_chains.h" #include "diag/en_tc_tracepoint.h" #include <asm/div64.h> +#include "lag.h" +#include "lag_mp.h" #define nic_chains(priv) ((priv)->fs.tc.chains) #define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index c63d78eda606..188994d091c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -213,19 +213,18 @@ static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); } -/* If packet is not IP's CHECKSUM_PARTIAL (e.g. icmd packet), - * need to set L3 checksum flag for IPsec - */ static void ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { + struct xfrm_offload *xo = xfrm_offload(skb); + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; - if (skb->encapsulation) { - eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (xo->inner_ipproto) { + eseg->cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM | MLX5_ETH_WQE_L3_INNER_CSUM; + } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; sq->stats->csum_partial_inner++; - } else { - sq->stats->csum_partial++; } } @@ -234,6 +233,11 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_accel_tx_state *accel, struct mlx5_wqe_eth_seg *eseg) { + if (unlikely(mlx5e_ipsec_eseg_meta(eseg))) { + ipsec_txwqe_build_eseg_csum(sq, skb, eseg); + return; + } + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; if (skb->encapsulation) { @@ -249,8 +253,6 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; sq->stats->csum_partial++; #endif - } else if (unlikely(mlx5e_ipsec_eseg_meta(eseg))) { - ipsec_txwqe_build_eseg_csum(sq, skb, eseg); } else sq->stats->csum_none++; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 0399a396d166..60a73990017c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -79,12 +79,16 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, int dest_num = 0; int err = 0; - if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { + if (vport->egress.legacy.drop_counter) { + drop_counter = vport->egress.legacy.drop_counter; + } else if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { drop_counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(drop_counter)) + if (IS_ERR(drop_counter)) { esw_warn(esw->dev, "vport[%d] configure egress drop rule counter err(%ld)\n", vport->vport, PTR_ERR(drop_counter)); + drop_counter = NULL; + } vport->egress.legacy.drop_counter = drop_counter; } @@ -123,7 +127,7 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; /* Attach egress drop flow counter */ - if (!IS_ERR_OR_NULL(drop_counter)) { + if (drop_counter) { flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); @@ -162,7 +166,7 @@ void 
esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, esw_acl_egress_table_destroy(vport); clean_drop_counter: - if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) { + if (vport->egress.legacy.drop_counter) { mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); vport->egress.legacy.drop_counter = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index f75b86abaf1c..b1a5199260f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -160,7 +160,9 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, esw_acl_ingress_lgcy_rules_destroy(vport); - if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { + if (vport->ingress.legacy.drop_counter) { + counter = vport->ingress.legacy.drop_counter; + } else if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { counter = mlx5_fc_create(esw->dev, false); if (IS_ERR(counter)) { esw_warn(esw->dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 985e305179d1..c6cc67cb4f6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -473,10 +473,9 @@ esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta err_min_rate: list_del(&group->list); - err = mlx5_destroy_scheduling_element_cmd(esw->dev, - SCHEDULING_HIERARCHY_E_SWITCH, - group->tsar_ix); - if (err) + if (mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + group->tsar_ix)) NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR for group failed"); err_sched_elem: kfree(group); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 67571e5040d6..269ebb53eda6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -113,7 +113,7 @@ static void mlx5i_grp_sw_update_stats(struct mlx5e_priv *priv) struct mlx5e_sw_stats s = { 0 }; int i, j; - for (i = 0; i < priv->max_nch; i++) { + for (i = 0; i < priv->stats_nch; i++) { struct mlx5e_channel_stats *channel_stats; struct mlx5e_rq_stats *rq_stats; @@ -711,7 +711,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num, goto destroy_ht; } - err = mlx5e_priv_init(epriv, netdev, mdev); + err = mlx5e_priv_init(epriv, prof, netdev, mdev); if (err) goto destroy_mdev_resources; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index ca5690b0a7ab..d2105c1635c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -442,6 +442,10 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) if (!mlx5_lag_is_ready(ldev)) { do_bond = false; } else { + /* VF LAG is in multipath mode, ignore bond change requests */ + if (mlx5_lag_is_multipath(dev0)) + return; + tracker = ldev->tracker; do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c index f239b352a58a..21fdaf708f1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c @@ -9,20 +9,23 @@ #include "eswitch.h" #include "lib/mlx5.h" +static bool __mlx5_lag_is_multipath(struct mlx5_lag 
*ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH); +} + static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) { if (!mlx5_lag_is_ready(ldev)) return false; + if (__mlx5_lag_is_active(ldev) && !__mlx5_lag_is_multipath(ldev)) + return false; + return mlx5_esw_multipath_prereq(ldev->pf[MLX5_LAG_P1].dev, ldev->pf[MLX5_LAG_P2].dev); } -static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev) -{ - return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH); -} - bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h index 729c839397a8..dea199e79bed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h @@ -24,12 +24,14 @@ struct lag_mp { void mlx5_lag_mp_reset(struct mlx5_lag *ldev); int mlx5_lag_mp_init(struct mlx5_lag *ldev); void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev); +bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ static inline void mlx5_lag_mp_reset(struct mlx5_lag *ldev) {}; static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; } static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {} +bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) { return false; } #endif /* CONFIG_MLX5_ESWITCH */ #endif /* __MLX5_LAG_MP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index ffac8a0e7a23..91e806c1aa21 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -448,22 +448,20 @@ static u64 find_target_cycles(struct mlx5_core_dev *mdev, s64 target_ns) return cycles_now + cycles_delta; } -static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, - s64 sec, u32 nsec) +static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, s64 sec) { - struct timespec64 ts; + struct timespec64 ts = {}; s64 target_ns; ts.tv_sec = sec; - ts.tv_nsec = nsec; target_ns = timespec64_to_ns(&ts); return find_target_cycles(mdev, target_ns); } -static u64 perout_conf_real_time(s64 sec, u32 nsec) +static u64 perout_conf_real_time(s64 sec) { - return (u64)nsec | (u64)sec << 32; + return (u64)sec << 32; } static int mlx5_perout_configure(struct ptp_clock_info *ptp, @@ -474,6 +472,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); + bool rt_mode = mlx5_real_time_mode(mdev); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; struct timespec64 ts; u32 field_select = 0; @@ -501,8 +500,10 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (on) { bool rt_mode = mlx5_real_time_mode(mdev); - u32 nsec; - s64 sec; + s64 sec = rq->perout.start.sec; + + if (rq->perout.start.nsec) + return -EINVAL; pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; @@ -513,14 +514,11 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if ((ns >> 1) != 500000000LL) return -EINVAL; - nsec = rq->perout.start.nsec; - sec = rq->perout.start.sec; - if (rt_mode && sec > U32_MAX) return -EINVAL; - time_stamp = rt_mode ? perout_conf_real_time(sec, nsec) : - perout_conf_internal_timer(mdev, sec, nsec); + time_stamp = rt_mode ? 
perout_conf_real_time(sec) : + perout_conf_internal_timer(mdev, sec); field_select |= MLX5_MTPPS_FS_PIN_MODE | MLX5_MTPPS_FS_PATTERN | @@ -538,6 +536,9 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (err) return err; + if (rt_mode) + return 0; + return mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); } @@ -705,20 +706,14 @@ static void ts_next_sec(struct timespec64 *ts) static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, struct mlx5_clock *clock) { - bool rt_mode = mlx5_real_time_mode(mdev); struct timespec64 ts; s64 target_ns; - if (rt_mode) - ts = mlx5_ptp_gettimex_real_time(mdev, NULL); - else - mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); - + mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); ts_next_sec(&ts); target_ns = timespec64_to_ns(&ts); - return rt_mode ? perout_conf_real_time(ts.tv_sec, ts.tv_nsec) : - find_target_cycles(mdev, target_ns); + return find_target_cycles(mdev, target_ns); } static int mlx5_pps_event(struct notifier_block *nb, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index c79a10b3454d..763c83a02380 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -13,8 +13,8 @@ #endif #define MLX5_MAX_IRQ_NAME (32) -/* max irq_index is 255. three chars */ -#define MLX5_MAX_IRQ_IDX_CHARS (3) +/* max irq_index is 2047, so four chars */ +#define MLX5_MAX_IRQ_IDX_CHARS (4) #define MLX5_SFS_PER_CTRL_IRQ 64 #define MLX5_IRQ_CTRL_SF_MAX 8 @@ -633,8 +633,9 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev) int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table) { if (table->sf_comp_pool) - return table->sf_comp_pool->xa_num_irqs.max - - table->sf_comp_pool->xa_num_irqs.min + 1; + return min_t(int, num_online_cpus(), + table->sf_comp_pool->xa_num_irqs.max - + table->sf_comp_pool->xa_num_irqs.min + 1); else return mlx5_irq_table_get_num_comp(table); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index 0998dcc9cac0..b29824448aa8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -24,16 +24,8 @@ #define MLXSW_THERMAL_ZONE_MAX_NAME 16 #define MLXSW_THERMAL_TEMP_SCORE_MAX GENMASK(31, 0) #define MLXSW_THERMAL_MAX_STATE 10 +#define MLXSW_THERMAL_MIN_STATE 2 #define MLXSW_THERMAL_MAX_DUTY 255 -/* Minimum and maximum fan allowed speed in percent: from 20% to 100%. Values - * MLXSW_THERMAL_MAX_STATE + x, where x is between 2 and 10 are used for - * setting fan speed dynamic minimum. For example, if value is set to 14 (40%) - * cooling levels vector will be set to 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10 to - * introduce PWM speed in percent: 40, 40, 40, 40, 40, 50, 60. 70, 80, 90, 100. - */ -#define MLXSW_THERMAL_SPEED_MIN (MLXSW_THERMAL_MAX_STATE + 2) -#define MLXSW_THERMAL_SPEED_MAX (MLXSW_THERMAL_MAX_STATE * 2) -#define MLXSW_THERMAL_SPEED_MIN_LEVEL 2 /* 20% */ /* External cooling devices, allowed for binding to mlxsw thermal zones. 
*/ static char * const mlxsw_thermal_external_allowed_cdev[] = { @@ -646,49 +638,16 @@ static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev, struct mlxsw_thermal *thermal = cdev->devdata; struct device *dev = thermal->bus_info->dev; char mfsc_pl[MLXSW_REG_MFSC_LEN]; - unsigned long cur_state, i; int idx; - u8 duty; int err; + if (state > MLXSW_THERMAL_MAX_STATE) + return -EINVAL; + idx = mlxsw_get_cooling_device_idx(thermal, cdev); if (idx < 0) return idx; - /* Verify if this request is for changing allowed fan dynamical - * minimum. If it is - update cooling levels accordingly and update - * state, if current state is below the newly requested minimum state. - * For example, if current state is 5, and minimal state is to be - * changed from 4 to 6, thermal->cooling_levels[0 to 5] will be changed - * all from 4 to 6. And state 5 (thermal->cooling_levels[4]) should be - * overwritten. - */ - if (state >= MLXSW_THERMAL_SPEED_MIN && - state <= MLXSW_THERMAL_SPEED_MAX) { - state -= MLXSW_THERMAL_MAX_STATE; - for (i = 0; i <= MLXSW_THERMAL_MAX_STATE; i++) - thermal->cooling_levels[i] = max(state, i); - - mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0); - err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl); - if (err) - return err; - - duty = mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl); - cur_state = mlxsw_duty_to_state(duty); - - /* If current fan state is lower than requested dynamical - * minimum, increase fan speed up to dynamical minimum. - */ - if (state < cur_state) - return 0; - - state = cur_state; - } - - if (state > MLXSW_THERMAL_MAX_STATE) - return -EINVAL; - /* Normalize the state to the valid speed range. */ state = thermal->cooling_levels[state]; mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state)); @@ -998,8 +957,7 @@ int mlxsw_thermal_init(struct mlxsw_core *core, /* Initialize cooling levels per PWM state. */ for (i = 0; i < MLXSW_THERMAL_MAX_STATE; i++) - thermal->cooling_levels[i] = max(MLXSW_THERMAL_SPEED_MIN_LEVEL, - i); + thermal->cooling_levels[i] = max(MLXSW_THERMAL_MIN_STATE, i); thermal->polling_delay = bus_info->low_frequency ? MLXSW_THERMAL_SLOW_POLL_INT : diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 13b0259f7ea6..fcace73eae40 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -353,13 +353,10 @@ static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci, struct sk_buff *skb; int err; - elem_info->u.rdq.skb = NULL; skb = netdev_alloc_skb_ip_align(NULL, buf_len); if (!skb) return -ENOMEM; - /* Assume that wqe was previously zeroed. 
*/ - err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data, buf_len, DMA_FROM_DEVICE); if (err) @@ -597,21 +594,26 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, struct pci_dev *pdev = mlxsw_pci->pdev; struct mlxsw_pci_queue_elem_info *elem_info; struct mlxsw_rx_info rx_info = {}; - char *wqe; + char wqe[MLXSW_PCI_WQE_SIZE]; struct sk_buff *skb; u16 byte_count; int err; elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); - skb = elem_info->u.sdq.skb; - if (!skb) - return; - wqe = elem_info->elem; - mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE); + skb = elem_info->u.rdq.skb; + memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE); if (q->consumer_counter++ != consumer_counter_limit) dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n"); + err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info); + if (err) { + dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n"); + goto out; + } + + mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE); + if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) { rx_info.is_lag = true; rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe_v, cqe); @@ -647,10 +649,7 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, skb_put(skb, byte_count); mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info); - memset(wqe, 0, q->elem_size); - err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info); - if (err) - dev_dbg_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n"); +out: /* Everything is set up, ring doorbell to pass elem to HW */ q->producer_counter++; mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); diff --git a/drivers/net/ethernet/microchip/encx24j600-regmap.c b/drivers/net/ethernet/microchip/encx24j600-regmap.c index 796e46a53926..81a8ccca7e5e 100644 --- a/drivers/net/ethernet/microchip/encx24j600-regmap.c +++ b/drivers/net/ethernet/microchip/encx24j600-regmap.c @@ -497,13 +497,19 @@ static struct regmap_bus phymap_encx24j600 = { .reg_read = regmap_encx24j600_phy_reg_read, }; -void devm_regmap_init_encx24j600(struct device *dev, - struct encx24j600_context *ctx) +int devm_regmap_init_encx24j600(struct device *dev, + struct encx24j600_context *ctx) { mutex_init(&ctx->mutex); regcfg.lock_arg = ctx; ctx->regmap = devm_regmap_init(dev, &regmap_encx24j600, ctx, &regcfg); + if (IS_ERR(ctx->regmap)) + return PTR_ERR(ctx->regmap); ctx->phymap = devm_regmap_init(dev, &phymap_encx24j600, ctx, &phycfg); + if (IS_ERR(ctx->phymap)) + return PTR_ERR(ctx->phymap); + + return 0; } EXPORT_SYMBOL_GPL(devm_regmap_init_encx24j600); diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index ee921a99e439..0bc6b3176fbf 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -1023,10 +1023,13 @@ static int encx24j600_spi_probe(struct spi_device *spi) priv->speed = SPEED_100; priv->ctx.spi = spi; - devm_regmap_init_encx24j600(&spi->dev, &priv->ctx); ndev->irq = spi->irq; ndev->netdev_ops = &encx24j600_netdev_ops; + ret = devm_regmap_init_encx24j600(&spi->dev, &priv->ctx); + if (ret) + goto out_free; + mutex_init(&priv->lock); /* Reset device and check if it is connected */ diff --git a/drivers/net/ethernet/microchip/encx24j600_hw.h b/drivers/net/ethernet/microchip/encx24j600_hw.h index fac61a8fbd02..34c5a289898c 100644 --- a/drivers/net/ethernet/microchip/encx24j600_hw.h +++ b/drivers/net/ethernet/microchip/encx24j600_hw.h @@ -15,8 +15,8 @@ struct encx24j600_context { int bank; }; -void
devm_regmap_init_encx24j600(struct device *dev, - struct encx24j600_context *ctx); +int devm_regmap_init_encx24j600(struct device *dev, + struct encx24j600_context *ctx); /* Single-byte instructions */ #define BANK_SELECT(bank) (0xC0 | ((bank & (BANK_MASK >> BANK_SHIFT)) << 1)) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 9e8561cdc32a..4d5a5d6595b3 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1743,6 +1743,16 @@ static int lan743x_tx_ring_init(struct lan743x_tx *tx) ret = -EINVAL; goto cleanup; } + if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev, + DMA_BIT_MASK(64))) { + if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev, + DMA_BIT_MASK(32))) { + dev_warn(&tx->adapter->pdev->dev, + "lan743x_: No suitable DMA available\n"); + ret = -ENOMEM; + goto cleanup; + } + } ring_allocation_size = ALIGN(tx->ring_size * sizeof(struct lan743x_tx_descriptor), PAGE_SIZE); @@ -1934,7 +1944,8 @@ static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index) index); } -static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index) +static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index, + gfp_t gfp) { struct net_device *netdev = rx->adapter->netdev; struct device *dev = &rx->adapter->pdev->dev; @@ -1948,7 +1959,7 @@ static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index) descriptor = &rx->ring_cpu_ptr[index]; buffer_info = &rx->buffer_info[index]; - skb = __netdev_alloc_skb(netdev, buffer_length, GFP_ATOMIC | GFP_DMA); + skb = __netdev_alloc_skb(netdev, buffer_length, gfp); if (!skb) return -ENOMEM; dma_ptr = dma_map_single(dev, skb->data, buffer_length, DMA_FROM_DEVICE); @@ -2110,7 +2121,8 @@ static int lan743x_rx_process_buffer(struct lan743x_rx *rx) /* save existing skb, allocate new skb and map to dma */ skb = buffer_info->skb; - if (lan743x_rx_init_ring_element(rx, rx->last_head)) { + if (lan743x_rx_init_ring_element(rx, rx->last_head, + GFP_ATOMIC | GFP_DMA)) { /* failed to allocate next skb. * Memory is very low. * Drop this packet and reuse buffer. @@ -2276,6 +2288,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx) ret = -EINVAL; goto cleanup; } + if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev, + DMA_BIT_MASK(64))) { + if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev, + DMA_BIT_MASK(32))) { + dev_warn(&rx->adapter->pdev->dev, + "lan743x_: No suitable DMA available\n"); + ret = -ENOMEM; + goto cleanup; + } + } ring_allocation_size = ALIGN(rx->ring_size * sizeof(struct lan743x_rx_descriptor), PAGE_SIZE); @@ -2315,13 +2337,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx) rx->last_head = 0; for (index = 0; index < rx->ring_size; index++) { - ret = lan743x_rx_init_ring_element(rx, index); + ret = lan743x_rx_init_ring_element(rx, index, GFP_KERNEL); if (ret) goto cleanup; } return 0; cleanup: + netif_warn(rx->adapter, ifup, rx->adapter->netdev, + "Error allocating memory for LAN743x\n"); + lan743x_rx_ring_cleanup(rx); return ret; } @@ -3019,6 +3044,8 @@ static int lan743x_pm_resume(struct device *dev) if (ret) { netif_err(adapter, probe, adapter->netdev, "lan743x_hardware_init returned %d\n", ret); + lan743x_pci_cleanup(adapter); + return ret; } /* open netdev when netdev is at running state while resume. 
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c index cbece6e9bff2..5030dfca3879 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c @@ -758,6 +758,7 @@ static int mchp_sparx5_probe(struct platform_device *pdev) err = dev_err_probe(sparx5->dev, PTR_ERR(serdes), "port %u: missing serdes\n", portno); + of_node_put(portnp); goto cleanup_config; } config->portno = portno; diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 1b21030308e5..030ae89f3a33 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1477,8 +1477,10 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, if (err) goto out; - if (cq->gdma_id >= gc->max_num_cqs) + if (WARN_ON(cq->gdma_id >= gc->max_num_cqs)) { + err = -EINVAL; goto out; + } gc->cq_table[cq->gdma_id] = cq->gdma_cq; diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 559177e6ded4..a08e4f530c1c 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -472,9 +472,9 @@ void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port, !(quirks & OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP)) ocelot_port_rmwl(ocelot_port, DEV_CLOCK_CFG_MAC_TX_RST | - DEV_CLOCK_CFG_MAC_TX_RST, + DEV_CLOCK_CFG_MAC_RX_RST, DEV_CLOCK_CFG_MAC_TX_RST | - DEV_CLOCK_CFG_MAC_TX_RST, + DEV_CLOCK_CFG_MAC_RX_RST, DEV_CLOCK_CFG); } EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_down); @@ -569,49 +569,44 @@ void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port, } EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_up); -static void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, - struct sk_buff *clone) +static int ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *clone) { struct ocelot_port *ocelot_port = ocelot->ports[port]; + unsigned long flags; + + spin_lock_irqsave(&ocelot->ts_id_lock, flags); - spin_lock(&ocelot_port->ts_id_lock); + if (ocelot_port->ptp_skbs_in_flight == OCELOT_MAX_PTP_ID || + ocelot->ptp_skbs_in_flight == OCELOT_PTP_FIFO_SIZE) { + spin_unlock_irqrestore(&ocelot->ts_id_lock, flags); + return -EBUSY; + } skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS; /* Store timestamp ID in OCELOT_SKB_CB(clone)->ts_id */ OCELOT_SKB_CB(clone)->ts_id = ocelot_port->ts_id; - ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4; - skb_queue_tail(&ocelot_port->tx_skbs, clone); - spin_unlock(&ocelot_port->ts_id_lock); -} + ocelot_port->ts_id++; + if (ocelot_port->ts_id == OCELOT_MAX_PTP_ID) + ocelot_port->ts_id = 0; -u32 ocelot_ptp_rew_op(struct sk_buff *skb) -{ - struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; - u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd; - u32 rew_op = 0; + ocelot_port->ptp_skbs_in_flight++; + ocelot->ptp_skbs_in_flight++; - if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) { - rew_op = ptp_cmd; - rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; - } else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { - rew_op = ptp_cmd; - } + skb_queue_tail(&ocelot_port->tx_skbs, clone); - return rew_op; + spin_unlock_irqrestore(&ocelot->ts_id_lock, flags); + + return 0; } -EXPORT_SYMBOL(ocelot_ptp_rew_op); -static bool ocelot_ptp_is_onestep_sync(struct sk_buff *skb) +static bool ocelot_ptp_is_onestep_sync(struct sk_buff *skb, + unsigned int ptp_class) { struct ptp_header *hdr; - unsigned int ptp_class; u8 msgtype, twostep; - 
ptp_class = ptp_classify_raw(skb); - if (ptp_class == PTP_CLASS_NONE) - return false; - hdr = ptp_parse_header(skb, ptp_class); if (!hdr) return false; @@ -631,10 +626,20 @@ int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, { struct ocelot_port *ocelot_port = ocelot->ports[port]; u8 ptp_cmd = ocelot_port->ptp_cmd; + unsigned int ptp_class; + int err; + + /* Don't do anything if PTP timestamping not enabled */ + if (!ptp_cmd) + return 0; + + ptp_class = ptp_classify_raw(skb); + if (ptp_class == PTP_CLASS_NONE) + return -EINVAL; /* Store ptp_cmd in OCELOT_SKB_CB(skb)->ptp_cmd */ if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { - if (ocelot_ptp_is_onestep_sync(skb)) { + if (ocelot_ptp_is_onestep_sync(skb, ptp_class)) { OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; return 0; } @@ -648,8 +653,12 @@ int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, if (!(*clone)) return -ENOMEM; - ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + err = ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + if (err) + return err; + OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; + OCELOT_SKB_CB(*clone)->ptp_class = ptp_class; } return 0; @@ -683,6 +692,17 @@ static void ocelot_get_hwtimestamp(struct ocelot *ocelot, spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); } +static bool ocelot_validate_ptp_skb(struct sk_buff *clone, u16 seqid) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(clone, OCELOT_SKB_CB(clone)->ptp_class); + if (WARN_ON(!hdr)) + return false; + + return seqid == ntohs(hdr->sequence_id); +} + void ocelot_get_txtstamp(struct ocelot *ocelot) { int budget = OCELOT_PTP_QUEUE_SZ; @@ -690,10 +710,10 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) while (budget--) { struct sk_buff *skb, *skb_tmp, *skb_match = NULL; struct skb_shared_hwtstamps shhwtstamps; + u32 val, id, seqid, txport; struct ocelot_port *port; struct timespec64 ts; unsigned long flags; - u32 val, id, txport; val = ocelot_read(ocelot, SYS_PTP_STATUS); @@ -706,10 +726,17 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) /* Retrieve the ts ID and Tx port */ id = SYS_PTP_STATUS_PTP_MESS_ID_X(val); txport = SYS_PTP_STATUS_PTP_MESS_TXPORT_X(val); + seqid = SYS_PTP_STATUS_PTP_MESS_SEQ_ID(val); - /* Retrieve its associated skb */ port = ocelot->ports[txport]; + spin_lock(&ocelot->ts_id_lock); + port->ptp_skbs_in_flight--; + ocelot->ptp_skbs_in_flight--; + spin_unlock(&ocelot->ts_id_lock); + + /* Retrieve its associated skb */ +try_again: spin_lock_irqsave(&port->tx_skbs.lock, flags); skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) { @@ -722,12 +749,20 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) spin_unlock_irqrestore(&port->tx_skbs.lock, flags); + if (WARN_ON(!skb_match)) + continue; + + if (!ocelot_validate_ptp_skb(skb_match, seqid)) { + dev_err_ratelimited(ocelot->dev, + "port %d received stale TX timestamp for seqid %d, discarding\n", + txport, seqid); + dev_kfree_skb_any(skb); + goto try_again; + } + /* Get the h/w timestamp */ ocelot_get_hwtimestamp(ocelot, &ts); - if (unlikely(!skb_match)) - continue; - /* Set the timestamp into the skb */ memset(&shhwtstamps, 0, sizeof(shhwtstamps)); shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec); @@ -1948,7 +1983,6 @@ void ocelot_init_port(struct ocelot *ocelot, int port) struct ocelot_port *ocelot_port = ocelot->ports[port]; skb_queue_head_init(&ocelot_port->tx_skbs); - spin_lock_init(&ocelot_port->ts_id_lock); /* Basic L2 initialization */ @@ -2081,6 +2115,7 @@ int ocelot_init(struct ocelot *ocelot) mutex_init(&ocelot->stats_lock); 
mutex_init(&ocelot->ptp_lock); spin_lock_init(&ocelot->ptp_clock_lock); + spin_lock_init(&ocelot->ts_id_lock); snprintf(queue_name, sizeof(queue_name), "%s-stats", dev_name(ocelot->dev)); ocelot->stats_queue = create_singlethread_workqueue(queue_name); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index e54b9fb2a97a..2545727fd5b2 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -8,6 +8,7 @@ * Copyright 2020-2021 NXP */ +#include <linux/dsa/ocelot.h> #include <linux/if_bridge.h> #include <linux/of_net.h> #include <linux/phy/phy.h> @@ -1625,7 +1626,7 @@ static int ocelot_port_phylink_create(struct ocelot *ocelot, int port, if (phy_mode == PHY_INTERFACE_MODE_QSGMII) ocelot_port_rmwl(ocelot_port, 0, DEV_CLOCK_CFG_MAC_TX_RST | - DEV_CLOCK_CFG_MAC_TX_RST, + DEV_CLOCK_CFG_MAC_RX_RST, DEV_CLOCK_CFG); ocelot_port->phy_mode = phy_mode; diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index 7945393a0655..99d7376a70a7 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -998,8 +998,8 @@ ocelot_vcap_block_find_filter_by_index(struct ocelot_vcap_block *block, } struct ocelot_vcap_filter * -ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int cookie, - bool tc_offload) +ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, + unsigned long cookie, bool tc_offload) { struct ocelot_vcap_filter *filter; diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 291ae6817c26..d51f799e4e86 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -969,6 +969,7 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, target = ocelot_regmap_init(ocelot, res); if (IS_ERR(target)) { err = PTR_ERR(target); + of_node_put(portnp); goto out_teardown; } diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 09c0e839cca5..3b6b2e61139e 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -8566,7 +8566,7 @@ static void s2io_io_resume(struct pci_dev *pdev) return; } - if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) { + if (do_s2io_prog_unicast(netdev, netdev->dev_addr) == FAILURE) { s2io_card_down(sp); pr_err("Can't restore mac addr after reset.\n"); return; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 11c83a99b014..f469950c7265 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -182,15 +182,21 @@ static int nfp_bpf_check_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu) { struct nfp_net *nn = netdev_priv(netdev); - unsigned int max_mtu; + struct nfp_bpf_vnic *bv; + struct bpf_prog *prog; if (~nn->dp.ctrl & NFP_NET_CFG_CTRL_BPF) return 0; - max_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32; - if (new_mtu > max_mtu) { - nn_info(nn, "BPF offload active, MTU over %u not supported\n", - max_mtu); + if (nn->xdp_hw.prog) { + prog = nn->xdp_hw.prog; + } else { + bv = nn->app_priv; + prog = bv->tc_prog; + } + + if (nfp_bpf_offload_check_mtu(nn, prog, new_mtu)) { + nn_info(nn, "BPF offload active, potential packet access beyond hardware packet boundary"); return -EBUSY; } return 0; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h 
b/drivers/net/ethernet/netronome/nfp/bpf/main.h index d0e17eebddd9..16841bb750b7 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -560,6 +560,8 @@ bool nfp_is_subprog_start(struct nfp_insn_meta *meta); void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog); int nfp_bpf_jit(struct nfp_prog *prog); bool nfp_bpf_supported_opcode(u8 code); +bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog, + unsigned int mtu); int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 53851853562c..9d97cd281f18 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -481,19 +481,28 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, return 0; } +bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog, + unsigned int mtu) +{ + unsigned int fw_mtu, pkt_off; + + fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32; + pkt_off = min(prog->aux->max_pkt_offset, mtu); + + return fw_mtu < pkt_off; +} + static int nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; - unsigned int fw_mtu, pkt_off, max_stack, max_prog_len; + unsigned int max_stack, max_prog_len; dma_addr_t dma_addr; void *img; int err; - fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32; - pkt_off = min(prog->aux->max_pkt_offset, nn->dp.netdev->mtu); - if (fw_mtu < pkt_off) { + if (nfp_bpf_offload_check_mtu(nn, prog, nn->dp.netdev->mtu)) { NL_SET_ERR_MSG_MOD(extack, "BPF offload not supported with potential packet access beyond HW packet split boundary"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index c029950a81e2..ac1dcfa1d179 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -830,10 +830,6 @@ static int nfp_flower_init(struct nfp_app *app) if (err) goto err_cleanup; - err = flow_indr_dev_register(nfp_flower_indr_setup_tc_cb, app); - if (err) - goto err_cleanup; - if (app_priv->flower_ext_feats & NFP_FL_FEATS_VF_RLIM) nfp_flower_qos_init(app); @@ -942,7 +938,20 @@ static int nfp_flower_start(struct nfp_app *app) return err; } - return nfp_tunnel_config_start(app); + err = flow_indr_dev_register(nfp_flower_indr_setup_tc_cb, app); + if (err) + return err; + + err = nfp_tunnel_config_start(app); + if (err) + goto err_tunnel_config; + + return 0; + +err_tunnel_config: + flow_indr_dev_unregister(nfp_flower_indr_setup_tc_cb, app, + nfp_flower_setup_indr_tc_release); + return err; } static void nfp_flower_stop(struct nfp_app *app) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c index 2643ea5948f4..154399c5453f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c @@ -196,7 +196,7 @@ int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg, } reg->dst_lmextn = swreg_lmextn(dst); - reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg); + reg->src_lmextn = swreg_lmextn(lreg) || swreg_lmextn(rreg); return 0; } @@ -277,7 +277,7 @@ int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg, } reg->dst_lmextn = swreg_lmextn(dst); - reg->src_lmextn = 
swreg_lmextn(lreg) | swreg_lmextn(rreg); + reg->src_lmextn = swreg_lmextn(lreg) || swreg_lmextn(rreg); return 0; } diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index d29fe562b3de..c910fa2f40a4 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1015,9 +1015,6 @@ static int lpc_eth_close(struct net_device *ndev) napi_disable(&pldat->napi); netif_stop_queue(ndev); - if (ndev->phydev) - phy_stop(ndev->phydev); - spin_lock_irqsave(&pldat->lock, flags); __lpc_eth_reset(pldat); netif_carrier_off(ndev); @@ -1025,6 +1022,8 @@ static int lpc_eth_close(struct net_device *ndev) writel(0, LPC_ENET_MAC2(pldat->net_base)); spin_unlock_irqrestore(&pldat->lock, flags); + if (ndev->phydev) + phy_stop(ndev->phydev); clk_disable_unprepare(pldat->clk); return 0; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 381966e8f557..7f3322ce044c 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1292,8 +1292,10 @@ int ionic_lif_addr_add(struct ionic_lif *lif, const u8 *addr) if (err && err != -EEXIST) { /* set the state back to NEW so we can try again later */ f = ionic_rx_filter_by_addr(lif, addr); - if (f && f->state == IONIC_FILTER_STATE_SYNCED) + if (f && f->state == IONIC_FILTER_STATE_SYNCED) { f->state = IONIC_FILTER_STATE_NEW; + set_bit(IONIC_LIF_F_FILTER_SYNC_NEEDED, lif->state); + } spin_unlock_bh(&lif->rx_filters.lock); @@ -1377,6 +1379,10 @@ static int ionic_addr_add(struct net_device *netdev, const u8 *addr) static int ionic_addr_del(struct net_device *netdev, const u8 *addr) { + /* Don't delete our own address from the uc list */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + return ionic_lif_list_addr(netdev_priv(netdev), addr, DEL_ADDR); } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c index 25ecfcfa1281..69728f9013cb 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c @@ -349,9 +349,6 @@ loop_out: list_for_each_entry_safe(sync_item, spos, &sync_add_list, list) { (void)ionic_lif_addr_add(lif, sync_item->f.cmd.mac.addr); - if (sync_item->f.state != IONIC_FILTER_STATE_SYNCED) - set_bit(IONIC_LIF_F_FILTER_SYNC_NEEDED, lif->state); - list_del(&sync_item->list); devm_kfree(dev, sync_item); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 15ef59aa34ff..d10e1cd6d2ba 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1299,6 +1299,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, } else { DP_NOTICE(cdev, "Failed to acquire PTT for aRFS\n"); + rc = -EINVAL; goto err; } } diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 46a6ff9a782d..2918947dd57c 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -157,6 +157,7 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { { PCI_VDEVICE(REALTEK, 0x8129) }, { PCI_VDEVICE(REALTEK, 0x8136), RTL_CFG_NO_GBIT }, { PCI_VDEVICE(REALTEK, 0x8161) }, + { PCI_VDEVICE(REALTEK, 0x8162) }, { PCI_VDEVICE(REALTEK, 0x8167) }, { PCI_VDEVICE(REALTEK, 0x8168) }, { PCI_VDEVICE(NCUBE, 0x8168) }, diff --git a/drivers/net/ethernet/sfc/mcdi_port_common.c 
b/drivers/net/ethernet/sfc/mcdi_port_common.c index 4bd3ef8f3384..c4fe3c48ac46 100644 --- a/drivers/net/ethernet/sfc/mcdi_port_common.c +++ b/drivers/net/ethernet/sfc/mcdi_port_common.c @@ -132,16 +132,27 @@ void mcdi_to_ethtool_linkset(u32 media, u32 cap, unsigned long *linkset) case MC_CMD_MEDIA_SFP_PLUS: case MC_CMD_MEDIA_QSFP_PLUS: SET_BIT(FIBRE); - if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) + if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) { SET_BIT(1000baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) - SET_BIT(10000baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) + SET_BIT(1000baseX_Full); + } + if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) { + SET_BIT(10000baseCR_Full); + SET_BIT(10000baseLR_Full); + SET_BIT(10000baseSR_Full); + } + if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) { SET_BIT(40000baseCR4_Full); - if (cap & (1 << MC_CMD_PHY_CAP_100000FDX_LBN)) + SET_BIT(40000baseSR4_Full); + } + if (cap & (1 << MC_CMD_PHY_CAP_100000FDX_LBN)) { SET_BIT(100000baseCR4_Full); - if (cap & (1 << MC_CMD_PHY_CAP_25000FDX_LBN)) + SET_BIT(100000baseSR4_Full); + } + if (cap & (1 << MC_CMD_PHY_CAP_25000FDX_LBN)) { SET_BIT(25000baseCR_Full); + SET_BIT(25000baseSR_Full); + } if (cap & (1 << MC_CMD_PHY_CAP_50000FDX_LBN)) SET_BIT(50000baseCR2_Full); break; @@ -192,15 +203,19 @@ u32 ethtool_linkset_to_mcdi_cap(const unsigned long *linkset) result |= (1 << MC_CMD_PHY_CAP_100FDX_LBN); if (TEST_BIT(1000baseT_Half)) result |= (1 << MC_CMD_PHY_CAP_1000HDX_LBN); - if (TEST_BIT(1000baseT_Full) || TEST_BIT(1000baseKX_Full)) + if (TEST_BIT(1000baseT_Full) || TEST_BIT(1000baseKX_Full) || + TEST_BIT(1000baseX_Full)) result |= (1 << MC_CMD_PHY_CAP_1000FDX_LBN); - if (TEST_BIT(10000baseT_Full) || TEST_BIT(10000baseKX4_Full)) + if (TEST_BIT(10000baseT_Full) || TEST_BIT(10000baseKX4_Full) || + TEST_BIT(10000baseCR_Full) || TEST_BIT(10000baseLR_Full) || + TEST_BIT(10000baseSR_Full)) result |= (1 << MC_CMD_PHY_CAP_10000FDX_LBN); - if (TEST_BIT(40000baseCR4_Full) || TEST_BIT(40000baseKR4_Full)) + if (TEST_BIT(40000baseCR4_Full) || TEST_BIT(40000baseKR4_Full) || + TEST_BIT(40000baseSR4_Full)) result |= (1 << MC_CMD_PHY_CAP_40000FDX_LBN); - if (TEST_BIT(100000baseCR4_Full)) + if (TEST_BIT(100000baseCR4_Full) || TEST_BIT(100000baseSR4_Full)) result |= (1 << MC_CMD_PHY_CAP_100000FDX_LBN); - if (TEST_BIT(25000baseCR_Full)) + if (TEST_BIT(25000baseCR_Full) || TEST_BIT(25000baseSR_Full)) result |= (1 << MC_CMD_PHY_CAP_25000FDX_LBN); if (TEST_BIT(50000baseCR2_Full)) result |= (1 << MC_CMD_PHY_CAP_50000FDX_LBN); diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index a39c5143b386..797e51802ccb 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -648,7 +648,7 @@ static int efx_ptp_get_attributes(struct efx_nic *efx) } else if (rc == -EINVAL) { fmt = MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_NANOSECONDS; } else if (rc == -EPERM) { - netif_info(efx, probe, efx->net_dev, "no PTP support\n"); + pci_info(efx->pci_dev, "no PTP support\n"); return rc; } else { efx_mcdi_display_error(efx, MC_CMD_PTP, sizeof(inbuf), @@ -824,7 +824,7 @@ static int efx_ptp_disable(struct efx_nic *efx) * should only have been called during probe. 
*/ if (rc == -ENOSYS || rc == -EPERM) - netif_info(efx, probe, efx->net_dev, "no PTP support\n"); + pci_info(efx->pci_dev, "no PTP support\n"); else if (rc) efx_mcdi_display_error(efx, MC_CMD_PTP, MC_CMD_PTP_IN_DISABLE_LEN, diff --git a/drivers/net/ethernet/sfc/siena_sriov.c b/drivers/net/ethernet/sfc/siena_sriov.c index 83dcfcae3d4b..441e7f3e5375 100644 --- a/drivers/net/ethernet/sfc/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena_sriov.c @@ -1057,7 +1057,7 @@ void efx_siena_sriov_probe(struct efx_nic *efx) return; if (efx_siena_sriov_cmd(efx, false, &efx->vi_scale, &count)) { - netif_info(efx, probe, efx->net_dev, "no SR-IOV VFs probed\n"); + pci_info(efx->pci_dev, "no SR-IOV VFs probed\n"); return; } if (count > 0 && count > max_vfs) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c index fbfda55b4c52..5e731a72cce8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c @@ -71,6 +71,7 @@ err_remove_config_dt: static const struct of_device_id dwmac_generic_match[] = { { .compatible = "st,spear600-gmac"}, + { .compatible = "snps,dwmac-3.40a"}, { .compatible = "snps,dwmac-3.50a"}, { .compatible = "snps,dwmac-3.610"}, { .compatible = "snps,dwmac-3.70a"}, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index ed817011a94a..6924a6aacbd5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -21,6 +21,7 @@ #include <linux/delay.h> #include <linux/mfd/syscon.h> #include <linux/regmap.h> +#include <linux/pm_runtime.h> #include "stmmac_platform.h" @@ -1528,6 +1529,8 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv) return ret; } + pm_runtime_get_sync(dev); + if (bsp_priv->integrated_phy) rk_gmac_integrated_phy_powerup(bsp_priv); @@ -1539,6 +1542,8 @@ static void rk_gmac_powerdown(struct rk_priv_data *gmac) if (gmac->integrated_phy) rk_gmac_integrated_phy_powerdown(gmac); + pm_runtime_put_sync(&gmac->pdev->dev); + phy_power_on(gmac, false); gmac_clk_enable(gmac, false); } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index 90383abafa66..f5581db0ba9b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -218,11 +218,18 @@ static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) readl(ioaddr + DMA_BUS_MODE + i * 4); } -static void dwmac1000_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwmac1000_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap = readl(ioaddr + DMA_HW_FEATURE); + if (!hw_cap) { + /* 0x00000000 is the value read on old hardware that does not + * implement this register + */ + return -EOPNOTSUPP; + } + dma_cap->mbps_10_100 = (hw_cap & DMA_HW_FEAT_MIISEL); dma_cap->mbps_1000 = (hw_cap & DMA_HW_FEAT_GMIISEL) >> 1; dma_cap->half_duplex = (hw_cap & DMA_HW_FEAT_HDSEL) >> 2; @@ -252,6 +259,8 @@ static void dwmac1000_get_hw_feature(void __iomem *ioaddr, dma_cap->number_tx_channel = (hw_cap & DMA_HW_FEAT_TXCHCNT) >> 22; /* Alternate (enhanced) DESC mode */ dma_cap->enh_desc = (hw_cap & DMA_HW_FEAT_ENHDESSEL) >> 24; + + return 0; } static void dwmac1000_rx_watchdog(void __iomem *ioaddr, u32 riwt, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 5be8e6a631d9..d99fa028c646 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -347,8 +347,8 @@ static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode, writel(mtl_tx_op, ioaddr + MTL_CHAN_TX_OP_MODE(channel)); } -static void dwmac4_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwmac4_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap = readl(ioaddr + GMAC_HW_FEATURE0); @@ -437,6 +437,8 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr, dma_cap->frpbs = (hw_cap & GMAC_HW_FEAT_FRPBS) >> 11; dma_cap->frpsel = (hw_cap & GMAC_HW_FEAT_FRPSEL) >> 10; dma_cap->dvlan = (hw_cap & GMAC_HW_FEAT_DVLAN) >> 5; + + return 0; } /* Enable/disable TSO feature and set MSS */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 906e985441a9..5e98355f422b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -371,8 +371,8 @@ static int dwxgmac2_dma_interrupt(void __iomem *ioaddr, return ret; } -static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwxgmac2_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap; @@ -445,6 +445,8 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, dma_cap->frpes = (hw_cap & XGMAC_HWFEAT_FRPES) >> 11; dma_cap->frpbs = (hw_cap & XGMAC_HWFEAT_FRPPB) >> 9; dma_cap->frpsel = (hw_cap & XGMAC_HWFEAT_FRPSEL) >> 3; + + return 0; } static void dwxgmac2_rx_watchdog(void __iomem *ioaddr, u32 riwt, u32 queue) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 6dc1c98ebec8..fe2660d5694d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -203,8 +203,8 @@ struct stmmac_dma_ops { int (*dma_interrupt) (void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan, u32 dir); /* If supported then get the optional core features */ - void (*get_hw_feature)(void __iomem *ioaddr, - struct dma_features *dma_cap); + int (*get_hw_feature)(void __iomem *ioaddr, + struct dma_features *dma_cap); /* Program the HW RX Watchdog */ void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 queue); void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan); @@ -255,7 +255,7 @@ struct stmmac_dma_ops { #define stmmac_dma_interrupt_status(__priv, __args...) \ stmmac_do_callback(__priv, dma, dma_interrupt, __args) #define stmmac_get_hw_feature(__priv, __args...) \ - stmmac_do_void_callback(__priv, dma, get_hw_feature, __args) + stmmac_do_callback(__priv, dma, get_hw_feature, __args) #define stmmac_rx_watchdog(__priv, __args...) \ stmmac_do_void_callback(__priv, dma, rx_watchdog, __args) #define stmmac_set_tx_ring_len(__priv, __args...) 
\ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 981ccf47dcea..3d67d1fa3690 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -477,6 +477,10 @@ bool stmmac_eee_init(struct stmmac_priv *priv) stmmac_lpi_entry_timer_config(priv, 0); del_timer_sync(&priv->eee_ctrl_timer); stmmac_set_eee_timer(priv, priv->hw, 0, eee_tw_timer); + if (priv->hw->xpcs) + xpcs_config_eee(priv->hw->xpcs, + priv->plat->mult_fact_100ns, + false); } mutex_unlock(&priv->lock); return false; @@ -732,7 +736,7 @@ static int stmmac_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; ptp_v2 = PTP_TCR_TSVER2ENA; snap_type_sel = PTP_TCR_SNAPTYPSEL_1; - if (priv->synopsys_id != DWMAC_CORE_5_10) + if (priv->synopsys_id < DWMAC_CORE_4_10) ts_event_en = PTP_TCR_TSEVNTENA; ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA; ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA; @@ -1038,7 +1042,7 @@ static void stmmac_mac_link_down(struct phylink_config *config, stmmac_mac_set(priv, priv->ioaddr, false); priv->eee_active = false; priv->tx_lpi_enabled = false; - stmmac_eee_init(priv); + priv->eee_enabled = stmmac_eee_init(priv); stmmac_set_eee_pls(priv, priv->hw, false); if (priv->dma_cap.fpesel) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 62cec9bfcd33..232ac98943cd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -508,6 +508,14 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) plat->pmt = 1; } + if (of_device_is_compatible(np, "snps,dwmac-3.40a")) { + plat->has_gmac = 1; + plat->enh_desc = 1; + plat->tx_coe = 1; + plat->bugged_jumbo = 1; + plat->pmt = 1; + } + if (of_device_is_compatible(np, "snps,dwmac-4.00") || of_device_is_compatible(np, "snps,dwmac-4.10a") || of_device_is_compatible(np, "snps,dwmac-4.20a") || diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 775dcf4ebde5..6b6f28d5b8d5 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -623,16 +623,16 @@ static int receive(struct net_device *dev, int cnt) /* --------------------------------------------------------------------- */ -#ifdef __i386__ +#if defined(__i386__) && !defined(CONFIG_UML) #include <asm/msr.h> #define GETTICK(x) \ ({ \ if (boot_cpu_has(X86_FEATURE_TSC)) \ x = (unsigned int)rdtsc(); \ }) -#else /* __i386__ */ +#else /* __i386__ && !CONFIG_UML */ #define GETTICK(x) -#endif /* __i386__ */ +#endif /* __i386__ && !CONFIG_UML */ static void epp_bh(struct work_struct *work) { diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig index 8f99cfa14680..d037682fb7ad 100644 --- a/drivers/net/ipa/Kconfig +++ b/drivers/net/ipa/Kconfig @@ -4,6 +4,7 @@ config QCOM_IPA depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST) select QCOM_MDT_LOADER if ARCH_QCOM + select QCOM_SCM select QCOM_QMI_HELPERS help Choose Y or M here to include support for the Qualcomm diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index fb0a83dc09ac..7de631f5356f 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -666,6 +666,10 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) { int ret; + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL0); 
+ if (ret < 0) + return ret; + if (enable) { /* Enable EEE */ ret = DW_VR_MII_EEE_LTX_EN | DW_VR_MII_EEE_LRX_EN | @@ -673,9 +677,6 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) DW_VR_MII_EEE_TX_EN_CTRL | DW_VR_MII_EEE_RX_EN_CTRL | mult_fact_100ns << DW_VR_MII_EEE_MULT_FACT_100NS_SHIFT; } else { - ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL0); - if (ret < 0) - return ret; ret &= ~(DW_VR_MII_EEE_LTX_EN | DW_VR_MII_EEE_LRX_EN | DW_VR_MII_EEE_TX_QUIET_EN | DW_VR_MII_EEE_RX_QUIET_EN | DW_VR_MII_EEE_TX_EN_CTRL | DW_VR_MII_EEE_RX_EN_CTRL | @@ -690,21 +691,28 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) if (ret < 0) return ret; - ret |= DW_VR_MII_EEE_TRN_LPI; + if (enable) + ret |= DW_VR_MII_EEE_TRN_LPI; + else + ret &= ~DW_VR_MII_EEE_TRN_LPI; + return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL1, ret); } EXPORT_SYMBOL_GPL(xpcs_config_eee); static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode) { - int ret; + int ret, mdio_ctrl; /* For AN for C37 SGMII mode, the settings are :- - * 1) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 10b (SGMII AN) - * 2) VR_MII_AN_CTRL Bit(3) [TX_CONFIG] = 0b (MAC side SGMII) + * 1) VR_MII_MMD_CTRL Bit(12) [AN_ENABLE] = 0b (Disable SGMII AN in case + it is already enabled) + * 2) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 10b (SGMII AN) + * 3) VR_MII_AN_CTRL Bit(3) [TX_CONFIG] = 0b (MAC side SGMII) * DW xPCS used with DW EQoS MAC is always MAC side SGMII. - * 3) VR_MII_DIG_CTRL1 Bit(9) [MAC_AUTO_SW] = 1b (Automatic + * 4) VR_MII_DIG_CTRL1 Bit(9) [MAC_AUTO_SW] = 1b (Automatic * speed/duplex mode change by HW after SGMII AN complete) + * 5) VR_MII_MMD_CTRL Bit(12) [AN_ENABLE] = 1b (Enable SGMII AN) * * Note: Since it is MAC side SGMII, there is no need to set * SR_MII_AN_ADV. MAC side SGMII receives AN Tx Config from @@ -712,6 +720,17 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode) * between PHY and Link Partner. There is also no need to * trigger AN restart for MAC-side SGMII. 
*/ + mdio_ctrl = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL); + if (mdio_ctrl < 0) + return mdio_ctrl; + + if (mdio_ctrl & AN_CL37_EN) { + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, + mdio_ctrl & ~AN_CL37_EN); + if (ret < 0) + return ret; + } + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL); if (ret < 0) return ret; @@ -736,7 +755,15 @@ static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs, unsigned int mode) else ret &= ~DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; - return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret); + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret); + if (ret < 0) + return ret; + + if (phylink_autoneg_inband(mode)) + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, + mdio_ctrl | AN_CL37_EN); + + return ret; } static int xpcs_config_2500basex(struct dw_xpcs *xpcs) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index e37edf70f2f8..6865d9319197 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -538,10 +538,16 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) bus->dev.groups = NULL; dev_set_name(&bus->dev, "%s", bus->id); + /* We need to set state to MDIOBUS_UNREGISTERED to correctly release + * the device in mdiobus_free() + * + * State will be updated later in this function in case of success + */ + bus->state = MDIOBUS_UNREGISTERED; + err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); - put_device(&bus->dev); return -EINVAL; } diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index f124a8a58bd4..a3bfb156c83d 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -243,62 +243,10 @@ static void phy_sanitize_settings(struct phy_device *phydev) } } -int phy_ethtool_ksettings_set(struct phy_device *phydev, - const struct ethtool_link_ksettings *cmd) -{ - __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); - u8 autoneg = cmd->base.autoneg; - u8 duplex = cmd->base.duplex; - u32 speed = cmd->base.speed; - - if (cmd->base.phy_address != phydev->mdio.addr) - return -EINVAL; - - linkmode_copy(advertising, cmd->link_modes.advertising); - - /* We make sure that we don't pass unsupported values in to the PHY */ - linkmode_and(advertising, advertising, phydev->supported); - - /* Verify the settings we care about. 
*/ - if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE) - return -EINVAL; - - if (autoneg == AUTONEG_ENABLE && linkmode_empty(advertising)) - return -EINVAL; - - if (autoneg == AUTONEG_DISABLE && - ((speed != SPEED_1000 && - speed != SPEED_100 && - speed != SPEED_10) || - (duplex != DUPLEX_HALF && - duplex != DUPLEX_FULL))) - return -EINVAL; - - phydev->autoneg = autoneg; - - if (autoneg == AUTONEG_DISABLE) { - phydev->speed = speed; - phydev->duplex = duplex; - } - - linkmode_copy(phydev->advertising, advertising); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, - phydev->advertising, autoneg == AUTONEG_ENABLE); - - phydev->master_slave_set = cmd->base.master_slave_cfg; - phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; - - /* Restart the PHY */ - phy_start_aneg(phydev); - - return 0; -} -EXPORT_SYMBOL(phy_ethtool_ksettings_set); - void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd) { + mutex_lock(&phydev->lock); linkmode_copy(cmd->link_modes.supported, phydev->supported); linkmode_copy(cmd->link_modes.advertising, phydev->advertising); linkmode_copy(cmd->link_modes.lp_advertising, phydev->lp_advertising); @@ -317,6 +265,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.autoneg = phydev->autoneg; cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl; cmd->base.eth_tp_mdix = phydev->mdix; + mutex_unlock(&phydev->lock); } EXPORT_SYMBOL(phy_ethtool_ksettings_get); @@ -751,7 +700,7 @@ static int phy_check_link_status(struct phy_device *phydev) } /** - * phy_start_aneg - start auto-negotiation for this PHY device + * _phy_start_aneg - start auto-negotiation for this PHY device * @phydev: the phy_device struct * * Description: Sanitizes the settings (if we're not autonegotiating @@ -759,25 +708,43 @@ static int phy_check_link_status(struct phy_device *phydev) * If the PHYCONTROL Layer is operating, we change the state to * reflect the beginning of Auto-negotiation or forcing. */ -int phy_start_aneg(struct phy_device *phydev) +static int _phy_start_aneg(struct phy_device *phydev) { int err; + lockdep_assert_held(&phydev->lock); + if (!phydev->drv) return -EIO; - mutex_lock(&phydev->lock); - if (AUTONEG_DISABLE == phydev->autoneg) phy_sanitize_settings(phydev); err = phy_config_aneg(phydev); if (err < 0) - goto out_unlock; + return err; if (phy_is_started(phydev)) err = phy_check_link_status(phydev); -out_unlock: + + return err; +} + +/** + * phy_start_aneg - start auto-negotiation for this PHY device + * @phydev: the phy_device struct + * + * Description: Sanitizes the settings (if we're not autonegotiating + * them), and then calls the driver's config_aneg function. + * If the PHYCONTROL Layer is operating, we change the state to + * reflect the beginning of Auto-negotiation or forcing. + */ +int phy_start_aneg(struct phy_device *phydev) +{ + int err; + + mutex_lock(&phydev->lock); + err = _phy_start_aneg(phydev); mutex_unlock(&phydev->lock); return err; @@ -800,6 +767,61 @@ static int phy_poll_aneg_done(struct phy_device *phydev) return ret < 0 ? 
ret : 0; } +int phy_ethtool_ksettings_set(struct phy_device *phydev, + const struct ethtool_link_ksettings *cmd) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); + u8 autoneg = cmd->base.autoneg; + u8 duplex = cmd->base.duplex; + u32 speed = cmd->base.speed; + + if (cmd->base.phy_address != phydev->mdio.addr) + return -EINVAL; + + linkmode_copy(advertising, cmd->link_modes.advertising); + + /* We make sure that we don't pass unsupported values in to the PHY */ + linkmode_and(advertising, advertising, phydev->supported); + + /* Verify the settings we care about. */ + if (autoneg != AUTONEG_ENABLE && autoneg != AUTONEG_DISABLE) + return -EINVAL; + + if (autoneg == AUTONEG_ENABLE && linkmode_empty(advertising)) + return -EINVAL; + + if (autoneg == AUTONEG_DISABLE && + ((speed != SPEED_1000 && + speed != SPEED_100 && + speed != SPEED_10) || + (duplex != DUPLEX_HALF && + duplex != DUPLEX_FULL))) + return -EINVAL; + + mutex_lock(&phydev->lock); + phydev->autoneg = autoneg; + + if (autoneg == AUTONEG_DISABLE) { + phydev->speed = speed; + phydev->duplex = duplex; + } + + linkmode_copy(phydev->advertising, advertising); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, + phydev->advertising, autoneg == AUTONEG_ENABLE); + + phydev->master_slave_set = cmd->base.master_slave_cfg; + phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; + + /* Restart the PHY */ + _phy_start_aneg(phydev); + + mutex_unlock(&phydev->lock); + return 0; +} +EXPORT_SYMBOL(phy_ethtool_ksettings_set); + /** * phy_speed_down - set speed to lowest speed supported by both link partners * @phydev: the phy_device struct diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index ba5ad86ec826..4f9990b47a37 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3125,6 +3125,9 @@ static void phy_shutdown(struct device *dev) { struct phy_device *phydev = to_phy_device(dev); + if (phydev->state == PHY_READY || !phydev->attached_dev) + return; + phy_disable_interrupts(phydev); } diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 34e90216bd2c..ab77a9f439ef 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -134,7 +134,7 @@ static const char * const sm_state_strings[] = { [SFP_S_LINK_UP] = "link_up", [SFP_S_TX_FAULT] = "tx_fault", [SFP_S_REINIT] = "reinit", - [SFP_S_TX_DISABLE] = "rx_disable", + [SFP_S_TX_DISABLE] = "tx_disable", }; static const char *sm_state_to_str(unsigned short sm_state) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 4c5d69732a7e..b554054a7560 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -99,6 +99,10 @@ config USB_RTL8150 config USB_RTL8152 tristate "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters" select MII + select CRC32 + select CRYPTO + select CRYPTO_HASH + select CRYPTO_SHA256 help This option adds support for Realtek RTL8152 based USB 2.0 10/100 Ethernet adapters and RTL8153 based USB 3.0 10/100/1000 @@ -113,6 +117,7 @@ config USB_LAN78XX select PHYLIB select MICROCHIP_PHY select FIXED_PHY + select CRC32 help This option adds support for Microchip LAN78XX based USB 2 & USB 3 10/100/1000 Ethernet adapters. diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 793f8fbe0069..63cd72c5f580 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -4122,6 +4122,12 @@ static int lan78xx_probe(struct usb_interface *intf, dev->maxpacket = usb_maxpacket(dev->udev, dev->pipe_out, 1); + /* Reject broken descriptors. 
*/ + if (dev->maxpacket == 0) { + ret = -ENODEV; + goto out4; + } + /* driver requires remote-wakeup capability during autosuspend. */ intf->needs_remote_wakeup = 1; diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 60ba9b734055..f329e39100a7 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -767,6 +767,7 @@ enum rtl8152_flags { PHY_RESET, SCHEDULE_TASKLET, GREEN_ETHERNET, + RX_EPROTO, }; #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 @@ -1770,6 +1771,14 @@ static void read_bulk_callback(struct urb *urb) rtl_set_unplug(tp); netif_device_detach(tp->netdev); return; + case -EPROTO: + urb->actual_length = 0; + spin_lock_irqsave(&tp->rx_lock, flags); + list_add_tail(&agg->list, &tp->rx_done); + spin_unlock_irqrestore(&tp->rx_lock, flags); + set_bit(RX_EPROTO, &tp->flags); + schedule_delayed_work(&tp->schedule, 1); + return; case -ENOENT: return; /* the urb is in unlink state */ case -ETIME: @@ -2425,6 +2434,7 @@ static int rx_bottom(struct r8152 *tp, int budget) if (list_empty(&tp->rx_done)) goto out1; + clear_bit(RX_EPROTO, &tp->flags); INIT_LIST_HEAD(&rx_queue); spin_lock_irqsave(&tp->rx_lock, flags); list_splice_init(&tp->rx_done, &rx_queue); @@ -2441,7 +2451,7 @@ static int rx_bottom(struct r8152 *tp, int budget) agg = list_entry(cursor, struct rx_agg, list); urb = agg->urb; - if (urb->actual_length < ETH_ZLEN) + if (urb->status != 0 || urb->actual_length < ETH_ZLEN) goto submit; agg_free = rtl_get_free_rx(tp, GFP_ATOMIC); @@ -6643,6 +6653,10 @@ static void rtl_work_func_t(struct work_struct *work) netif_carrier_ok(tp->netdev)) tasklet_schedule(&tp->tx_tl); + if (test_and_clear_bit(RX_EPROTO, &tp->flags) && + !list_empty(&tp->rx_done)) + napi_schedule(&tp->napi); + mutex_unlock(&tp->control); out1: diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 840c1c2ab16a..a33d7fb82a00 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1788,6 +1788,11 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) if (!dev->rx_urb_size) dev->rx_urb_size = dev->hard_mtu; dev->maxpacket = usb_maxpacket (dev->udev, dev->out, 1); + if (dev->maxpacket == 0) { + /* that is a broken device */ + status = -ENODEV; + goto out4; + } /* let userspace know we have a random address */ if (ether_addr_equal(net->dev_addr, node_id)) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 79bd2585ec6b..4ad25a8b0870 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -406,7 +406,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, * add_recvbuf_mergeable() + get_mergeable_buf_len() */ truesize = headroom ? PAGE_SIZE : truesize; - tailroom = truesize - len - headroom; + tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len); buf = p - headroom; len -= hdr_len; diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 142f70670f5c..8799854bacb2 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -3833,7 +3833,6 @@ vmxnet3_suspend(struct device *device) vmxnet3_free_intr_resources(adapter); netif_device_detach(netdev); - netif_tx_stop_all_queues(netdev); /* Create wake-up filters. 
*/ pmConf = adapter->pm_conf; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index bf2fac913942..662e26117353 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1360,8 +1360,6 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); - nf_reset_ct(skb); - /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For strict packets with a source LLA, determine the dst using the @@ -1424,8 +1422,6 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; - nf_reset_ct(skb); - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig index 741289e385d5..ca007b800f75 100644 --- a/drivers/net/wireless/ath/ath10k/Kconfig +++ b/drivers/net/wireless/ath/ath10k/Kconfig @@ -44,7 +44,7 @@ config ATH10K_SNOC tristate "Qualcomm ath10k SNOC support" depends on ATH10K depends on ARCH_QCOM || COMPILE_TEST - depends on QCOM_SCM || !QCOM_SCM #if QCOM_SCM=m this can't be =y + select QCOM_SCM select QCOM_QMI_HELPERS help This module adds support for integrated WCN3990 chip connected diff --git a/drivers/net/wireless/ath/ath5k/Kconfig b/drivers/net/wireless/ath/ath5k/Kconfig index f35cd8de228e..6914b37bb0fb 100644 --- a/drivers/net/wireless/ath/ath5k/Kconfig +++ b/drivers/net/wireless/ath/ath5k/Kconfig @@ -3,9 +3,7 @@ config ATH5K tristate "Atheros 5xxx wireless cards support" depends on (PCI || ATH25) && MAC80211 select ATH_COMMON - select MAC80211_LEDS - select LEDS_CLASS - select NEW_LEDS + select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211 select ATH5K_AHB if ATH25 select ATH5K_PCI if !ATH25 help diff --git a/drivers/net/wireless/ath/ath5k/led.c b/drivers/net/wireless/ath/ath5k/led.c index 6a2a16856763..33e9928af363 100644 --- a/drivers/net/wireless/ath/ath5k/led.c +++ b/drivers/net/wireless/ath/ath5k/led.c @@ -89,7 +89,8 @@ static const struct pci_device_id ath5k_led_devices[] = { void ath5k_led_enable(struct ath5k_hw *ah) { - if (test_bit(ATH_STAT_LEDSOFT, ah->status)) { + if (IS_ENABLED(CONFIG_MAC80211_LEDS) && + test_bit(ATH_STAT_LEDSOFT, ah->status)) { ath5k_hw_set_gpio_output(ah, ah->led_pin); ath5k_led_off(ah); } @@ -104,7 +105,8 @@ static void ath5k_led_on(struct ath5k_hw *ah) void ath5k_led_off(struct ath5k_hw *ah) { - if (!test_bit(ATH_STAT_LEDSOFT, ah->status)) + if (!IS_ENABLED(CONFIG_MAC80211_LEDS) || + !test_bit(ATH_STAT_LEDSOFT, ah->status)) return; ath5k_hw_set_gpio(ah, ah->led_pin, !ah->led_on); } @@ -146,7 +148,7 @@ ath5k_register_led(struct ath5k_hw *ah, struct ath5k_led *led, static void ath5k_unregister_led(struct ath5k_led *led) { - if (!led->ah) + if (!IS_ENABLED(CONFIG_MAC80211_LEDS) || !led->ah) return; led_classdev_unregister(&led->led_dev); ath5k_led_off(led->ah); @@ -169,7 +171,7 @@ int ath5k_init_leds(struct ath5k_hw *ah) char name[ATH5K_LED_MAX_NAME_LEN + 1]; const struct pci_device_id *match; - if (!ah->pdev) + if (!IS_ENABLED(CONFIG_MAC80211_LEDS) || !ah->pdev) return 0; #ifdef CONFIG_ATH5K_AHB diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index f7b96cd69242..9db12ffd2ff8 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ 
-7463,23 +7463,18 @@ static s32 brcmf_translate_country_code(struct brcmf_pub *drvr, char alpha2[2], s32 found_index; int i; + country_codes = drvr->settings->country_codes; + if (!country_codes) { + brcmf_dbg(TRACE, "No country codes configured for device\n"); + return -EINVAL; + } + if ((alpha2[0] == ccreq->country_abbrev[0]) && (alpha2[1] == ccreq->country_abbrev[1])) { brcmf_dbg(TRACE, "Country code already set\n"); return -EAGAIN; } - country_codes = drvr->settings->country_codes; - if (!country_codes) { - brcmf_dbg(TRACE, "No country codes configured for device, using ISO3166 code and 0 rev\n"); - memset(ccreq, 0, sizeof(*ccreq)); - ccreq->country_abbrev[0] = alpha2[0]; - ccreq->country_abbrev[1] = alpha2[1]; - ccreq->ccode[0] = alpha2[0]; - ccreq->ccode[1] = alpha2[1]; - return 0; - } - found_index = -1; for (i = 0; i < country_codes->table_size; i++) { cc = &country_codes->table[i]; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 0e97d5e6c644..9f706fffb592 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -160,6 +160,7 @@ static void iwl_mvm_wowlan_program_keys(struct ieee80211_hw *hw, mvm->ptk_icvlen = key->icv_len; mvm->gtk_ivlen = key->iv_len; mvm->gtk_icvlen = key->icv_len; + mutex_unlock(&mvm->mutex); /* don't upload key again */ return; @@ -360,11 +361,11 @@ static void iwl_mvm_wowlan_get_rsc_v5_data(struct ieee80211_hw *hw, if (sta) { rsc = data->rsc->ucast_rsc; } else { - if (WARN_ON(data->gtks > ARRAY_SIZE(data->gtk_ids))) + if (WARN_ON(data->gtks >= ARRAY_SIZE(data->gtk_ids))) return; data->gtk_ids[data->gtks] = key->keyidx; rsc = data->rsc->mcast_rsc[data->gtks % 2]; - if (WARN_ON(key->keyidx > + if (WARN_ON(key->keyidx >= ARRAY_SIZE(data->rsc->mcast_key_id_map))) return; data->rsc->mcast_key_id_map[key->keyidx] = data->gtks % 2; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 25af88a3edce..e91f8e889df7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -662,12 +662,13 @@ static bool __iwl_mvm_remove_time_event(struct iwl_mvm *mvm, u32 *uid) { u32 id; - struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(te_data->vif); + struct iwl_mvm_vif *mvmvif; enum nl80211_iftype iftype; if (!te_data->vif) return false; + mvmvif = iwl_mvm_vif_from_mac80211(te_data->vif); iftype = te_data->vif->type; /* diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 61b2797a34a8..e3996ff99bad 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -547,6 +547,8 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x43F0, 0x0074, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x43F0, 0x0078, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x43F0, 0x007C, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x43F0, 0x1651, killer1650s_2ax_cfg_qu_b0_hr_b0, iwl_ax201_killer_1650s_name), + IWL_DEV_INFO(0x43F0, 0x1652, killer1650i_2ax_cfg_qu_b0_hr_b0, iwl_ax201_killer_1650i_name), IWL_DEV_INFO(0x43F0, 0x2074, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x43F0, 0x4070, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0xA0F0, 0x0070, iwl_ax201_cfg_qu_hr, NULL), diff --git a/drivers/net/wireless/marvell/mwifiex/sta_tx.c b/drivers/net/wireless/marvell/mwifiex/sta_tx.c index 241305377e20..a9b5eb992220 100644 --- 
a/drivers/net/wireless/marvell/mwifiex/sta_tx.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_tx.c @@ -62,8 +62,8 @@ void *mwifiex_process_sta_txpd(struct mwifiex_private *priv, pkt_type = mwifiex_is_skb_mgmt_frame(skb) ? PKT_TYPE_MGMT : 0; - pad = ((void *)skb->data - (sizeof(*local_tx_pd) + hroom)- - NULL) & (MWIFIEX_DMA_ALIGN_SZ - 1); + pad = ((uintptr_t)skb->data - (sizeof(*local_tx_pd) + hroom)) & + (MWIFIEX_DMA_ALIGN_SZ - 1); skb_push(skb, sizeof(*local_tx_pd) + pad); local_tx_pd = (struct txpd *) skb->data; diff --git a/drivers/net/wireless/marvell/mwifiex/uap_txrx.c b/drivers/net/wireless/marvell/mwifiex/uap_txrx.c index 9bbdb8dfce62..245ff644f81e 100644 --- a/drivers/net/wireless/marvell/mwifiex/uap_txrx.c +++ b/drivers/net/wireless/marvell/mwifiex/uap_txrx.c @@ -475,8 +475,8 @@ void *mwifiex_process_uap_txpd(struct mwifiex_private *priv, pkt_type = mwifiex_is_skb_mgmt_frame(skb) ? PKT_TYPE_MGMT : 0; - pad = ((void *)skb->data - (sizeof(*txpd) + hroom) - NULL) & - (MWIFIEX_DMA_ALIGN_SZ - 1); + pad = ((uintptr_t)skb->data - (sizeof(*txpd) + hroom)) & + (MWIFIEX_DMA_ALIGN_SZ - 1); skb_push(skb, sizeof(*txpd) + pad); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index e31b98403f31..fc41ba95f81d 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1730,6 +1730,10 @@ static int netfront_resume(struct xenbus_device *dev) dev_dbg(&dev->dev, "%s\n", dev->nodename); + netif_tx_lock_bh(info->netdev); + netif_device_detach(info->netdev); + netif_tx_unlock_bh(info->netdev); + xennet_disconnect_backend(info); return 0; } @@ -2349,6 +2353,10 @@ static int xennet_connect(struct net_device *dev) * domain a kick because we've probably just requeued some * packets. */ + netif_tx_lock_bh(np->netdev); + netif_device_attach(np->netdev); + netif_tx_unlock_bh(np->netdev); + netif_carrier_on(np->netdev); for (j = 0; j < num_queues; ++j) { queue = &np->queues[j]; diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c index 517376c43b86..16ceb763594f 100644 --- a/drivers/nfc/port100.c +++ b/drivers/nfc/port100.c @@ -1006,11 +1006,11 @@ static u64 port100_get_command_type_mask(struct port100 *dev) skb = port100_alloc_skb(dev, 0); if (!skb) - return -ENOMEM; + return 0; resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_COMMAND_TYPE, skb); if (IS_ERR(resp)) - return PTR_ERR(resp); + return 0; if (resp->len < 8) mask = 0; diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index d16cf3ff644e..b23f47936473 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -1226,11 +1226,9 @@ static int st95hf_remove(struct spi_device *nfc_spi_dev) &reset_cmd, ST95HF_RESET_CMD_LEN, ASYNC); - if (result) { + if (result) dev_err(&spictx->spidev->dev, "ST95HF reset failed in remove() err = %d\n", result); - return result; - } /* wait for 3 ms to complete the controller reset process */ usleep_range(3000, 4000); @@ -1239,7 +1237,7 @@ static int st95hf_remove(struct spi_device *nfc_spi_dev) if (stcontext->st95hf_supply) regulator_disable(stcontext->st95hf_supply); - return result; + return 0; } /* Register as SPI protocol driver */ diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 088d3dd6f6fa..b6c6866f9259 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -162,7 +162,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk, return err; } -static blk_qc_t nd_blk_submit_bio(struct bio *bio) +static void nd_blk_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip; struct nd_namespace_blk *nsblk = 
bio->bi_bdev->bd_disk->private_data; @@ -173,7 +173,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio) bool do_acct; if (!bio_integrity_prep(bio)) - return BLK_QC_T_NONE; + return; bip = bio_integrity(bio); rw = bio_data_dir(bio); @@ -199,7 +199,6 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio) bio_end_io_acct(bio, start); bio_endio(bio); - return BLK_QC_T_NONE; } static int nsblk_rw_bytes(struct nd_namespace_common *ndns, diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 92dec4952297..4295fa809420 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1440,7 +1440,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, return ret; } -static blk_qc_t btt_submit_bio(struct bio *bio) +static void btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct btt *btt = bio->bi_bdev->bd_disk->private_data; @@ -1451,7 +1451,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio) bool do_acct; if (!bio_integrity_prep(bio)) - return BLK_QC_T_NONE; + return; do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) @@ -1483,7 +1483,6 @@ static blk_qc_t btt_submit_bio(struct bio *bio) bio_end_io_acct(bio, start); bio_endio(bio); - return BLK_QC_T_NONE; } static int btt_rw_page(struct block_device *bdev, sector_t sector, diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 7de592d7eff4..6a45fa91e8a3 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/module.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/device.h> #include <linux/ctype.h> #include <linux/ndctl.h> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ef4950f80832..c74d7bceb222 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -190,7 +190,7 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem, return rc; } -static blk_qc_t pmem_submit_bio(struct bio *bio) +static void pmem_submit_bio(struct bio *bio) { int ret = 0; blk_status_t rc = 0; @@ -229,7 +229,6 @@ static blk_qc_t pmem_submit_bio(struct bio *bio) bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); - return BLK_QC_T_NONE; } static int pmem_rw_page(struct block_device *bdev, sector_t sector, @@ -333,26 +332,6 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - struct pmem_device *pmem = pgmap->owner; - - blk_cleanup_disk(pmem->disk); -} - -static void pmem_release_queue(void *pgmap) -{ - pmem_pagemap_cleanup(pgmap); -} - -static void pmem_pagemap_kill(struct dev_pagemap *pgmap) -{ - struct request_queue *q = - container_of(pgmap->ref, struct request_queue, q_usage_counter); - - blk_freeze_queue_start(q); -} - static void pmem_release_disk(void *__pmem) { struct pmem_device *pmem = __pmem; @@ -360,12 +339,9 @@ static void pmem_release_disk(void *__pmem) kill_dax(pmem->dax_dev); put_dax(pmem->dax_dev); del_gendisk(pmem->disk); -} -static const struct dev_pagemap_ops fsdax_pagemap_ops = { - .kill = pmem_pagemap_kill, - .cleanup = pmem_pagemap_cleanup, -}; + blk_cleanup_disk(pmem->disk); +} static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) @@ -427,10 +403,8 @@ static int pmem_attach_disk(struct device *dev, pmem->disk = disk; pmem->pgmap.owner = pmem; pmem->pfn_flags = PFN_DEV; - pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; - pmem->pgmap.ops = 
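The nvdimm hunks here, and the nvme ones further down, are part of a tree-wide change in which ->submit_bio() returns void instead of a blk_qc_t cookie; the polling cookie is carried in the bio itself (see the bio->bi_cookie assignment in the multipath hunk below). A toy model of that shift, with invented types that only stand in for struct bio:

#include <stdio.h>

struct toy_bio {
	unsigned int cookie;	/* models bio->bi_cookie */
	int submitted;
};

/* The old style returned the cookie; the new style stores it and returns void. */
static void toy_submit_bio(struct toy_bio *bio, unsigned int queue_id)
{
	bio->cookie = queue_id;
	bio->submitted = 1;
}

int main(void)
{
	struct toy_bio b = { 0 };

	toy_submit_bio(&b, 3);
	printf("cookie=%u submitted=%d\n", b.cookie, b.submitted);	/* poll later using b.cookie */
	return 0;
}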
&fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -444,16 +418,12 @@ static int pmem_attach_disk(struct device *dev, pmem->pgmap.range.end = res->end; pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; - pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; bb_range = pmem->pgmap.range; } else { addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); - if (devm_add_action_or_reset(dev, pmem_release_queue, - &pmem->pgmap)) - return -ENOMEM; bb_range.start = res->start; bb_range.end = res->end; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7712a8f78337..838b5e2058be 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -6,6 +6,7 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> +#include <linux/blk-integrity.h> #include <linux/compat.h> #include <linux/delay.h> #include <linux/errno.h> @@ -118,25 +119,6 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, struct nvme_command *cmd); -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. - */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_set_queue_dying(ns->queue); - blk_mq_unquiesce_queue(ns->queue); - - set_capacity_and_notify(ns->disk, 0); -} - void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* @@ -221,7 +203,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) { dev_info(ctrl->device, - "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); + "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl)); flush_work(&ctrl->reset_work); nvme_stop_ctrl(ctrl); @@ -345,15 +327,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req) return RETRY; } -static inline void nvme_end_req(struct request *req) +static inline void nvme_end_req_zoned(struct request *req) { - blk_status_t status = nvme_error_status(nvme_req(req)->status); - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = nvme_lba_to_sect(req->q->queuedata, le64_to_cpu(nvme_req(req)->result.u64)); +} +static inline void nvme_end_req(struct request *req) +{ + blk_status_t status = nvme_error_status(nvme_req(req)->status); + + nvme_end_req_zoned(req); nvme_trace_bio_complete(req); blk_mq_end_request(req, status); } @@ -380,6 +366,13 @@ void nvme_complete_rq(struct request *req) } EXPORT_SYMBOL_GPL(nvme_complete_rq); +void nvme_complete_batch_req(struct request *req) +{ + nvme_cleanup_cmd(req); + nvme_end_req_zoned(req); +} +EXPORT_SYMBOL_GPL(nvme_complete_batch_req); + /* * Called to unwind from ->queue_rq on a failed command submission so that the * multipathing code gets called to potentially failover to another path. 
@@ -631,7 +624,7 @@ static inline void nvme_init_request(struct request *req, req->cmd_flags |= REQ_FAILFAST_DRIVER; if (req->mq_hctx->type == HCTX_TYPE_POLL) - req->cmd_flags |= REQ_HIPRI; + req->cmd_flags |= REQ_POLLED; nvme_clear_nvme_request(req); memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); } @@ -822,6 +815,7 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { + memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } @@ -873,6 +867,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, return BLK_STS_IOERR; } + memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); @@ -889,6 +884,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { + memset(cmnd, 0, sizeof(*cmnd)); + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) return nvme_setup_discard(ns, req, cmnd); @@ -922,9 +919,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd->rw.opcode = op; + cmnd->rw.flags = 0; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->rw.rsvd2 = 0; + cmnd->rw.metadata = 0; cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + cmnd->rw.reftag = 0; + cmnd->rw.apptag = 0; + cmnd->rw.appmask = 0; if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); @@ -981,10 +984,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; blk_status_t ret = BLK_STS_OK; - if (!(req->rq_flags & RQF_DONTPREP)) { + if (!(req->rq_flags & RQF_DONTPREP)) nvme_clear_nvme_request(req); - memset(cmd, 0, sizeof(*cmd)); - } switch (req_op(req)) { case REQ_OP_DRV_IN: @@ -2600,6 +2601,24 @@ static ssize_t nvme_subsys_show_nqn(struct device *dev, } static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); +static ssize_t nvme_subsys_show_type(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + switch (subsys->subtype) { + case NVME_NQN_DISC: + return sysfs_emit(buf, "discovery\n"); + case NVME_NQN_NVME: + return sysfs_emit(buf, "nvm\n"); + default: + return sysfs_emit(buf, "reserved\n"); + } +} +static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type); + #define nvme_subsys_show_str_function(field) \ static ssize_t subsys_##field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -2620,6 +2639,7 @@ static struct attribute *nvme_subsys_attrs[] = { &subsys_attr_serial.attr, &subsys_attr_firmware_rev.attr, &subsys_attr_subsysnqn.attr, + &subsys_attr_subsystype.attr, #ifdef CONFIG_NVME_MULTIPATH &subsys_attr_iopolicy.attr, #endif @@ -2690,6 +2710,21 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + + /* Versions prior to 1.4 don't necessarily report a valid type */ + if (id->cntrltype == NVME_CTRL_DISC || + !strcmp(subsys->subnqn, 
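The core.c hunks above drop the unconditional memset() in nvme_setup_cmd() and instead let each opcode-specific helper clear the command, with the hot read/write path assigning every field (zeroing only the ones it does not otherwise fill). A hedged sketch of the same idea on an invented, much smaller command structure:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented 16-byte "command"; the real struct nvme_command is larger. */
struct toy_cmd {
	uint8_t  opcode;
	uint8_t  flags;
	uint16_t rsvd;
	uint32_t nsid;
	uint64_t slba;
};

/* Rare opcode: clearing the whole command is the simplest correct option. */
static void setup_flush(struct toy_cmd *c, uint32_t nsid)
{
	memset(c, 0, sizeof(*c));
	c->opcode = 0x00;
	c->nsid = nsid;
}

/* Hot path: every field is written explicitly, so no memset is needed. */
static void setup_rw(struct toy_cmd *c, uint32_t nsid, uint64_t slba)
{
	c->opcode = 0x02;
	c->flags = 0;
	c->rsvd = 0;
	c->nsid = nsid;
	c->slba = slba;
}

int main(void)
{
	struct toy_cmd c;

	setup_flush(&c, 1);
	setup_rw(&c, 1, 2048);
	printf("opcode=%#x slba=%llu\n", (unsigned)c.opcode, (unsigned long long)c.slba);
	return 0;
}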
NVME_DISC_SUBSYS_NAME)) + subsys->subtype = NVME_NQN_DISC; + else + subsys->subtype = NVME_NQN_NVME; + + if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) { + dev_err(ctrl->device, + "Subsystem %s is not a discovery controller", + subsys->subnqn); + kfree(subsys); + return -EINVAL; + } subsys->awupf = le16_to_cpu(id->awupf); #ifdef CONFIG_NVME_MULTIPATH subsys->iopolicy = NVME_IOPOLICY_NUMA; @@ -3550,10 +3585,15 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, return 0; } +static void nvme_cdev_rel(struct device *dev) +{ + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); +} + void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) { cdev_device_del(cdev, cdev_device); - ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); + put_device(cdev_device); } int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, @@ -3566,14 +3606,14 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, return minor; cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); cdev_device->class = nvme_ns_chr_class; + cdev_device->release = nvme_cdev_rel; device_initialize(cdev_device); cdev_init(cdev, fops); cdev->owner = owner; ret = cdev_device_add(cdev, cdev_device); - if (ret) { + if (ret) put_device(cdev_device); - ida_simple_remove(&nvme_ns_chr_minor_ida, minor); - } + return ret; } @@ -3605,11 +3645,9 @@ static int nvme_add_ns_cdev(struct nvme_ns *ns) ns->ctrl->instance, ns->head->instance); if (ret) return ret; - ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, - ns->ctrl->ops->module); - if (ret) - kfree_const(ns->cdev_device.kobj.name); - return ret; + + return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, @@ -4470,6 +4508,37 @@ out: } EXPORT_SYMBOL_GPL(nvme_init_ctrl); +static void nvme_start_ns_queue(struct nvme_ns *ns) +{ + if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) + blk_mq_unquiesce_queue(ns->queue); +} + +static void nvme_stop_ns_queue(struct nvme_ns *ns) +{ + if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) + blk_mq_quiesce_queue(ns->queue); +} + +/* + * Prepare a queue for teardown. + * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_butex. This will end buffered writers dirtying pages that can't + * be synced. 
+ */ +static void nvme_set_queue_dying(struct nvme_ns *ns) +{ + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + return; + + blk_set_queue_dying(ns->queue); + nvme_start_ns_queue(ns); + + set_capacity_and_notify(ns->disk, 0); +} + /** * nvme_kill_queues(): Ends all namespace queues * @ctrl: the dead controller that needs to end @@ -4485,7 +4554,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) /* Forcibly unquiesce queues to avoid blocking dispatch */ if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); list_for_each_entry(ns, &ctrl->namespaces, list) nvme_set_queue_dying(ns); @@ -4548,7 +4617,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_quiesce_queue(ns->queue); + nvme_stop_ns_queue(ns); up_read(&ctrl->namespaces_rwsem); } EXPORT_SYMBOL_GPL(nvme_stop_queues); @@ -4559,11 +4628,25 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_unquiesce_queue(ns->queue); + nvme_start_ns_queue(ns); up_read(&ctrl->namespaces_rwsem); } EXPORT_SYMBOL_GPL(nvme_start_queues); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +{ + if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) + blk_mq_quiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); + +void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +{ + if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) + blk_mq_unquiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_start_admin_queue); + void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 668c6bb7a567..c5a2b71c5268 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -548,6 +548,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_TOS, "tos=%d" }, { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, + { NVMF_OPT_DISCOVERY, "discovery" }, { NVMF_OPT_ERR, NULL } }; @@ -823,6 +824,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->tos = token; break; + case NVMF_OPT_DISCOVERY: + opts->discovery_nqn = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -949,7 +953,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ - NVMF_OPT_DISABLE_SQFLOW |\ + NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\ NVMF_OPT_FAIL_FAST_TMO) static struct nvme_ctrl * diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a146cb903869..c3203ff1c654 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -67,6 +67,7 @@ enum { NVMF_OPT_TOS = 1 << 19, NVMF_OPT_FAIL_FAST_TMO = 1 << 20, NVMF_OPT_HOST_IFACE = 1 << 21, + NVMF_OPT_DISCOVERY = 1 << 22, }; /** @@ -178,6 +179,13 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, return true; } +static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsys) + return ctrl->opts->subsysnqn; + return ctrl->subsys->subnqn; +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); diff 
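The new start/stop helpers above pair blk_mq_quiesce_queue()/blk_mq_unquiesce_queue() with a per-queue flag so that repeated stops and starts stay balanced. A userspace sketch of that idempotent pattern using C11 atomics; the names and the quiesce counter are invented and only model the behaviour:

#include <stdatomic.h>
#include <stdio.h>

#define Q_STOPPED 0x1u

struct toy_queue {
	atomic_uint flags;
	int quiesce_depth;	/* models the underlying blk-mq quiesce depth */
};

static void toy_stop_queue(struct toy_queue *q)
{
	/* only the 0 -> STOPPED transition actually quiesces */
	if (!(atomic_fetch_or(&q->flags, Q_STOPPED) & Q_STOPPED))
		q->quiesce_depth++;
}

static void toy_start_queue(struct toy_queue *q)
{
	if (atomic_fetch_and(&q->flags, ~Q_STOPPED) & Q_STOPPED)
		q->quiesce_depth--;
}

int main(void)
{
	struct toy_queue q = { .flags = 0, .quiesce_depth = 0 };

	toy_stop_queue(&q);
	toy_stop_queue(&q);	/* second stop is a no-op, depth stays balanced */
	toy_start_queue(&q);
	printf("quiesce_depth=%d\n", q.quiesce_depth);	/* back to 0 */
	return 0;
}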
--git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index aa14ad963d91..71b3108c22f0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -16,6 +16,7 @@ #include <linux/nvme-fc.h> #include "fc.h" #include <scsi/scsi_transport_fc.h> +#include <linux/blk-mq-pci.h> /* *************************** Data Structures/Defines ****************** */ @@ -2382,7 +2383,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -2510,7 +2511,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2841,6 +2842,28 @@ nvme_fc_complete_rq(struct request *rq) nvme_fc_ctrl_put(ctrl); } +static int nvme_fc_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_fc_ctrl *ctrl = set->driver_data; + int i; + + for (i = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + if (!map->nr_queues) { + WARN_ON(i == HCTX_TYPE_DEFAULT); + continue; + } + + /* Call LLDD map queue functionality if defined */ + if (ctrl->lport->ops->map_queues) + ctrl->lport->ops->map_queues(&ctrl->lport->localport, + map); + else + blk_mq_map_queues(map); + } + return 0; +} static const struct blk_mq_ops nvme_fc_mq_ops = { .queue_rq = nvme_fc_queue_rq, @@ -2849,6 +2872,7 @@ static const struct blk_mq_ops nvme_fc_mq_ops = { .exit_request = nvme_fc_exit_request, .init_hctx = nvme_fc_init_hctx, .timeout = nvme_fc_timeout, + .map_queues = nvme_fc_map_queues, }; static int @@ -3095,7 +3119,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); ret = nvme_init_ctrl_finish(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) @@ -3249,7 +3273,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ nvme_start_queues(&ctrl->ctrl); @@ -3572,7 +3596,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", - ctrl->cnum, ctrl->ctrl.opts->subsysnqn); + ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl)); return &ctrl->ctrl; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e8ccdd398f78..7f2071f2460c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -85,8 +85,13 @@ void nvme_failover_req(struct request *req) } spin_lock_irqsave(&ns->head->requeue_lock, flags); - for (bio = req->bio; bio; bio = bio->bi_next) + for (bio = req->bio; bio; bio = bio->bi_next) { bio_set_dev(bio, ns->head->disk->part0); + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } + } blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); @@ -100,8 
+105,11 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->head->disk) - kblockd_schedule_work(&ns->head->requeue_work); + if (!ns->head->disk) + continue; + kblockd_schedule_work(&ns->head->requeue_work); + if (ctrl->state == NVME_CTRL_LIVE) + disk_uevent(ns->head->disk, KOBJ_CHANGE); } up_read(&ctrl->namespaces_rwsem); } @@ -138,13 +146,12 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; - mutex_lock(&ctrl->scan_lock); down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (nvme_mpath_clear_current_path(ns)) - kblockd_schedule_work(&ns->head->requeue_work); + list_for_each_entry(ns, &ctrl->namespaces, list) { + nvme_mpath_clear_current_path(ns); + kblockd_schedule_work(&ns->head->requeue_work); + } up_read(&ctrl->namespaces_rwsem); - mutex_unlock(&ctrl->scan_lock); } void nvme_mpath_revalidate_paths(struct nvme_ns *ns) @@ -312,12 +319,11 @@ static bool nvme_available_path(struct nvme_ns_head *head) return false; } -static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) +static void nvme_ns_head_submit_bio(struct bio *bio) { struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); struct nvme_ns *ns; - blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; /* @@ -334,7 +340,7 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); - ret = submit_bio_noacct(bio); + submit_bio_noacct(bio); } else if (nvme_available_path(head)) { dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); @@ -349,7 +355,6 @@ static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) } srcu_read_unlock(&head->srcu, srcu_idx); - return ret; } static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) @@ -431,8 +436,6 @@ static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) return ret; ret = nvme_cdev_add(&head->cdev, &head->cdev_device, &nvme_ns_head_chr_fops, THIS_MODULE); - if (ret) - kfree_const(head->cdev_device.kobj.name); return ret; } @@ -481,6 +484,15 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + /* + * This assumes all controllers that refer to a namespace either + * support poll queues or not. That is not a strict guarantee, + * but if the assumption is wrong the effect is only suboptimal + * performance but not correctness problem. + */ + if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && + ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); /* set to a default value of 512 until the disk is validated */ blk_queue_logical_block_size(head->disk->queue, 512); @@ -496,13 +508,23 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) static void nvme_mpath_set_live(struct nvme_ns *ns) { struct nvme_ns_head *head = ns->head; + int rc; if (!head->disk) return; + /* + * test_and_set_bit() is used because it is protecting against two nvme + * paths simultaneously calling device_add_disk() on the same namespace + * head. 
+ */ if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { - device_add_disk(&head->subsys->dev, head->disk, - nvme_ns_id_attr_groups); + rc = device_add_disk(&head->subsys->dev, head->disk, + nvme_ns_id_attr_groups); + if (rc) { + clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags); + return; + } nvme_add_ns_head_cdev(head); } @@ -540,7 +562,7 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, return -EINVAL; nr_nsids = le32_to_cpu(desc->nnsids); - nsid_buf_size = nr_nsids * sizeof(__le32); + nsid_buf_size = flex_array_size(desc, nsids, nr_nsids); if (WARN_ON_ONCE(desc->grpid == 0)) return -EINVAL; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ed79a6c7e804..b334af8aa264 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -342,6 +342,7 @@ struct nvme_ctrl { int nr_reconnects; unsigned long flags; #define NVME_CTRL_FAILFAST_EXPIRED 0 +#define NVME_CTRL_ADMIN_Q_STOPPED 1 struct nvmf_ctrl_options *opts; struct page *discard_page; @@ -372,6 +373,7 @@ struct nvme_subsystem { char model[40]; char firmware_rev[8]; u8 cmic; + enum nvme_subsys_type subtype; u16 vendor_id; u16 awupf; /* 0's based awupf value. */ struct ida ns_ida; @@ -463,6 +465,7 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 #define NVME_NS_READY 4 +#define NVME_NS_STOPPED 5 struct cdev cdev; struct device cdev_device; @@ -638,6 +641,20 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) } void nvme_complete_rq(struct request *req); +void nvme_complete_batch_req(struct request *req); + +static __always_inline void nvme_complete_batch(struct io_comp_batch *iob, + void (*fn)(struct request *rq)) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + fn(req); + nvme_complete_batch_req(req); + } + blk_mq_end_request_batch(iob); +} + blk_status_t nvme_host_path_error(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); void nvme_cancel_tagset(struct nvme_ctrl *ctrl); @@ -665,6 +682,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); +void nvme_start_admin_queue(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 456a0e8a5718..ca2ee806d74b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -10,6 +10,7 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> +#include <linux/blk-integrity.h> #include <linux/dmi.h> #include <linux/init.h> #include <linux/interrupt.h> @@ -244,8 +245,15 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) { unsigned int mem_size = nvme_dbbuf_size(dev); - if (dev->dbbuf_dbs) + if (dev->dbbuf_dbs) { + /* + * Clear the dbbuf memory so the driver doesn't observe stale + * values from the previous instantiation. 
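nvme_complete_batch() above walks the io_comp_batch request list with rq_list_for_each(), applies a driver callback to each request, and then ends the whole batch at once. A minimal stand-alone model of that walk over a hand-rolled singly linked list (all names invented):

#include <stdio.h>

struct toy_req {
	int tag;
	struct toy_req *next;	/* models the rq_list linkage */
};

#define toy_list_for_each(head, pos) \
	for (pos = (head); pos; pos = pos->next)

static void unmap_req(struct toy_req *req)
{
	printf("unmap tag %d\n", req->tag);
}

/* Per-request callback first, then one "end" step for the whole batch. */
static void complete_batch(struct toy_req *head, void (*fn)(struct toy_req *))
{
	struct toy_req *req;

	toy_list_for_each(head, req)
		fn(req);
	printf("end batch\n");
}

int main(void)
{
	struct toy_req r2 = { 2, NULL };
	struct toy_req r1 = { 1, &r2 };

	complete_batch(&r1, unmap_req);
	return 0;
}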
+ */ + memset(dev->dbbuf_dbs, 0, mem_size); + memset(dev->dbbuf_eis, 0, mem_size); return 0; + } dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_dbs_dma_addr, @@ -958,7 +966,7 @@ out_free_cmd: return ret; } -static void nvme_pci_complete_rq(struct request *req) +static __always_inline void nvme_pci_unmap_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_dev *dev = iod->nvmeq->dev; @@ -968,9 +976,19 @@ static void nvme_pci_complete_rq(struct request *req) rq_integrity_vec(req)->bv_len, rq_data_dir(req)); if (blk_rq_nr_phys_segments(req)) nvme_unmap_data(dev, req); +} + +static void nvme_pci_complete_rq(struct request *req) +{ + nvme_pci_unmap_rq(req); nvme_complete_rq(req); } +static void nvme_pci_complete_batch(struct io_comp_batch *iob) +{ + nvme_complete_batch(iob, nvme_pci_unmap_rq); +} + /* We read the CQE phase first to check if the rest of the entry is valid */ static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) { @@ -995,7 +1013,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) return nvmeq->dev->tagset.tags[nvmeq->qid - 1]; } -static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct io_comp_batch *iob, u16 idx) { struct nvme_completion *cqe = &nvmeq->cqes[idx]; __u16 command_id = READ_ONCE(cqe->command_id); @@ -1022,7 +1041,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) } trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); - if (!nvme_try_complete_req(req, cqe->status, cqe->result)) + if (!nvme_try_complete_req(req, cqe->status, cqe->result) && + !blk_mq_add_to_batch(req, iob, nvme_req(req)->status, + nvme_pci_complete_batch)) nvme_pci_complete_rq(req); } @@ -1038,7 +1059,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline int nvme_process_cq(struct nvme_queue *nvmeq) +static inline int nvme_poll_cq(struct nvme_queue *nvmeq, + struct io_comp_batch *iob) { int found = 0; @@ -1049,7 +1071,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) * the cqe requires a full read memory barrier */ dma_rmb(); - nvme_handle_cqe(nvmeq, nvmeq->cq_head); + nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head); nvme_update_cq_head(nvmeq); } @@ -1061,9 +1083,13 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; + DEFINE_IO_COMP_BATCH(iob); - if (nvme_process_cq(nvmeq)) + if (nvme_poll_cq(nvmeq, &iob)) { + if (!rq_list_empty(iob.req_list)) + nvme_pci_complete_batch(&iob); return IRQ_HANDLED; + } return IRQ_NONE; } @@ -1087,11 +1113,11 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - nvme_process_cq(nvmeq); + nvme_poll_cq(nvmeq, NULL); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } -static int nvme_poll(struct blk_mq_hw_ctx *hctx) +static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_queue *nvmeq = hctx->driver_data; bool found; @@ -1100,7 +1126,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx) return 0; spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq); + found = nvme_poll_cq(nvmeq, iob); spin_unlock(&nvmeq->cq_poll_lock); return found; @@ -1273,7 +1299,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * Did we miss an interrupt? 
*/ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) - nvme_poll(req->mq_hctx); + nvme_poll(req->mq_hctx, NULL); else nvme_poll_irqdisable(nvmeq); @@ -1330,7 +1356,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) iod->aborted = 1; cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = req->tag; + cmd.abort.cid = nvme_cid(req); cmd.abort.sqid = cpu_to_le16(nvmeq->qid); dev_warn(nvmeq->dev->ctrl.device, @@ -1395,7 +1421,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); + nvme_stop_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; @@ -1433,7 +1459,7 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev) for (i = dev->ctrl.queue_count - 1; i > 0; i--) { spin_lock(&dev->queues[i].cq_poll_lock); - nvme_process_cq(&dev->queues[i]); + nvme_poll_cq(&dev->queues[i], NULL); spin_unlock(&dev->queues[i].cq_poll_lock); } } @@ -1654,7 +1680,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. */ - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1688,7 +1714,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); return 0; } @@ -2623,7 +2649,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (shutdown) { nvme_start_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - blk_mq_unquiesce_queue(dev->ctrl.admin_q); + nvme_start_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 042c594bc57e..850f84d204d0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -13,6 +13,7 @@ #include <linux/atomic.h> #include <linux/blk-mq.h> #include <linux/blk-mq-rdma.h> +#include <linux/blk-integrity.h> #include <linux/types.h> #include <linux/list.h> #include <linux/mutex.h> @@ -918,7 +919,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) @@ -927,7 +928,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -1025,12 +1026,12 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -1095,11 +1096,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) return ret; if (ctrl->ctrl.icdoff) { + ret = -EOPNOTSUPP; 
dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); goto destroy_admin; } if (!(ctrl->ctrl.sgls & (1 << 2))) { + ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported!\n"); goto destroy_admin; @@ -1111,6 +1114,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); } + if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) { + dev_warn(ctrl->ctrl.device, + "ctrl sqsize %u > max queue size %u, clamping down\n", + ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE); + ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1; + } + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { dev_warn(ctrl->ctrl.device, "sqsize %u > ctrl maxcmd %u, clamping down\n", @@ -1153,7 +1163,7 @@ destroy_io: nvme_rdma_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1193,7 +1203,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2105,7 +2115,7 @@ unmap_qe: return ret; } -static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_rdma_queue *queue = hctx->driver_data; @@ -2231,7 +2241,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else @@ -2385,7 +2395,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, goto out_uninit_ctrl; dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", - ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr); mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3c1c29dd3020..33bc83d8d992 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -926,12 +926,14 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req) static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; + int req_data_len = req->data_len; while (true) { struct page *page = nvme_tcp_req_cur_page(req); size_t offset = nvme_tcp_req_cur_offset(req); size_t len = nvme_tcp_req_cur_length(req); bool last = nvme_tcp_pdu_last_send(req, len); + int req_data_sent = req->data_sent; int ret, flags = MSG_DONTWAIT; if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) @@ -958,7 +960,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) * in the request where we don't want to modify it as we may * compete with the RX path completing the request. 
*/ - if (req->data_sent + ret < req->data_len) + if (req_data_sent + ret < req_data_len) nvme_tcp_advance_req(req, ret); /* fully successful last send in current PDU */ @@ -1048,10 +1050,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; + size_t offset = req->offset; int ret; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { - .iov_base = &req->ddgst + req->offset, + .iov_base = (u8 *)&req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; @@ -1064,7 +1067,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) if (unlikely(ret <= 0)) return ret; - if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) { + if (offset + ret == NVME_TCP_DIGEST_LENGTH) { nvme_tcp_done_send_req(queue); return 1; } @@ -1915,7 +1918,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); error = nvme_init_ctrl_finish(ctrl); if (error) @@ -1924,7 +1927,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return 0; out_quiesce_queue: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1946,12 +1949,12 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); if (remove) - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1960,7 +1963,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_sync_io_queues(ctrl); @@ -2055,7 +2058,7 @@ destroy_io: nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2098,7 +2101,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_start_admin_queue(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2116,7 +2119,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_stop_admin_queue(ctrl); if (shutdown) nvme_shutdown_ctrl(ctrl); else @@ -2429,7 +2432,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) return 0; } -static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; @@ -2582,7 +2585,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, goto out_uninit_ctrl; dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", - 
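The kvec fix above (and the matching one for the target in tcp.c further down) casts the digest's address to u8 * before adding the resume offset, because pointer arithmetic on the digest's own type steps in multiples of its size rather than in bytes. A tiny demonstration with an invented 4-byte digest:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ddgst = 0;
	size_t offset = 1;	/* one byte of the 4-byte digest already sent */

	const void *wrong = &ddgst + offset;			/* advances sizeof(uint32_t) = 4 bytes */
	const void *right = (const uint8_t *)&ddgst + offset;	/* advances 1 byte, as intended */

	printf("wrong: +%td bytes, right: +%td bytes\n",
	       (const char *)wrong - (const char *)&ddgst,
	       (const char *)right - (const char *)&ddgst);
	return 0;
}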
ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr); mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index d95010481fce..bfc259e0d7b8 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -233,6 +233,8 @@ out_free: blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, struct nvme_command *c, enum nvme_zone_mgmt_action action) { + memset(c, 0, sizeof(*c)); + c->zms.opcode = nvme_cmd_zone_mgmt_send; c->zms.nsid = cpu_to_le32(ns->head->ns_id); c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index aa6d84d8848e..6fb24746de06 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -264,7 +264,7 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt); desc->state = req->port->ana_state[grpid]; memset(desc->rsvd17, 0, sizeof(desc->rsvd17)); - return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32); + return struct_size(desc, nsids, count); } static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) @@ -278,8 +278,8 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) u16 status; status = NVME_SC_INTERNAL; - desc = kmalloc(sizeof(struct nvme_ana_group_desc) + - NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL); + desc = kmalloc(struct_size(desc, nsids, NVMET_MAX_NAMESPACES), + GFP_KERNEL); if (!desc) goto out; @@ -374,13 +374,19 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->rab = 6; + if (nvmet_is_disc_subsys(ctrl->subsys)) + id->cntrltype = NVME_CTRL_DISC; + else + id->cntrltype = NVME_CTRL_IO; + /* * XXX: figure out how we can assign a IEEE OUI, but until then * the safest is to leave it as zeroes. */ /* we support multiple ports, multiples hosts and ANA: */ - id->cmic = (1 << 0) | (1 << 1) | (1 << 3); + id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL | + NVME_CTRL_CMIC_ANA; /* Limit MDTS according to transport capability */ if (ctrl->ops->get_mdts) @@ -536,7 +542,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) * Our namespace might always be shared. Not just with other * controllers, but also with any other user of the block device. 
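The admin-cmd.c hunks above switch open-coded "sizeof(struct) + n * sizeof(element)" arithmetic to struct_size() (and the multipath.c hunk earlier uses flex_array_size()) for structures ending in a flexible array member. Below is a simplified, overflow-unchecked stand-in for illustration only; the real kernel helpers also saturate on overflow:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_group_desc {
	uint32_t grpid;
	uint32_t nnsids;
	uint32_t nsids[];	/* flexible array member */
};

/* Simplified model of struct_size(); sizeof does not evaluate its operand. */
#define toy_struct_size(p, member, count) \
	(sizeof(*(p)) + (count) * sizeof((p)->member[0]))

int main(void)
{
	struct toy_group_desc *desc;
	size_t n = 4;

	desc = calloc(1, toy_struct_size(desc, nsids, n));
	if (!desc)
		return 1;
	desc->nnsids = (uint32_t)n;
	printf("allocated %zu bytes\n", toy_struct_size(desc, nsids, n));
	free(desc);
	return 0;
}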
*/ - id->nmic = (1 << 0); + id->nmic = NVME_NS_NMIC_SHARED; id->anagrpid = cpu_to_le32(req->ns->anagrpid); memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid)); @@ -1008,7 +1014,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_cmd(req); - if (nvmet_req_subsys(req)->type == NVME_NQN_DISC) + if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) return nvmet_parse_discovery_cmd(req); ret = nvmet_check_ctrl_status(req); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index be5d82421e3a..091a0ca16361 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1233,6 +1233,44 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_model); +static ssize_t nvmet_subsys_attr_discovery_nqn_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%s\n", + nvmet_disc_subsys->subsysnqn); +} + +static ssize_t nvmet_subsys_attr_discovery_nqn_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + char *subsysnqn; + int len; + + len = strcspn(page, "\n"); + if (!len) + return -EINVAL; + + subsysnqn = kmemdup_nul(page, len, GFP_KERNEL); + if (!subsysnqn) + return -ENOMEM; + + /* + * The discovery NQN must be different from subsystem NQN. + */ + if (!strcmp(subsysnqn, subsys->subsysnqn)) { + kfree(subsysnqn); + return -EBUSY; + } + down_write(&nvmet_config_sem); + kfree(nvmet_disc_subsys->subsysnqn); + nvmet_disc_subsys->subsysnqn = subsysnqn; + up_write(&nvmet_config_sem); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_discovery_nqn); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, char *page) @@ -1262,6 +1300,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_cntlid_min, &nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_model, + &nvmet_subsys_attr_attr_discovery_nqn, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif @@ -1553,6 +1592,8 @@ static void nvmet_port_release(struct config_item *item) { struct nvmet_port *port = to_nvmet_port(item); + /* Let inflight controllers teardown complete */ + flush_scheduled_work(); list_del(&port->global_entry); kfree(port->ana_state); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b8425fa34300..5119c687de68 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1140,7 +1140,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) * should verify iosqes,iocqes are zeroed, however that * would break backwards compatibility, so don't enforce it. 
*/ - if (ctrl->subsys->type != NVME_NQN_DISC && + if (!nvmet_is_disc_subsys(ctrl->subsys) && (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) { ctrl->csts = NVME_CSTS_CFS; @@ -1205,7 +1205,10 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) /* CC.EN timeout in 500msec units: */ ctrl->cap |= (15ULL << 24); /* maximum queue entries supported: */ - ctrl->cap |= NVMET_QUEUE_SIZE - 1; + if (ctrl->ops->get_max_queue_size) + ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1; + else + ctrl->cap |= NVMET_QUEUE_SIZE - 1; if (nvmet_is_passthru_subsys(ctrl->subsys)) nvmet_passthrough_override_cap(ctrl); @@ -1278,7 +1281,7 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) if (subsys->allow_any_host) return true; - if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */ + if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */ return true; list_for_each_entry(p, &subsys->hosts, entry) { @@ -1367,6 +1370,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, mutex_init(&ctrl->lock); ctrl->port = req->port; + ctrl->ops = req->ops; INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); INIT_LIST_HEAD(&ctrl->async_events); @@ -1405,13 +1409,11 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, } ctrl->cntlid = ret; - ctrl->ops = req->ops; - /* * Discovery controllers may use some arbitrary high value * in order to cleanup stale discovery sessions */ - if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato) + if (nvmet_is_disc_subsys(ctrl->subsys) && !kato) kato = NVMET_DISC_KATO_MS; /* keep-alive timeout in seconds */ @@ -1491,7 +1493,8 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, if (!port) return NULL; - if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) { + if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn) || + !strcmp(nvmet_disc_subsys->subsysnqn, subsysnqn)) { if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) return NULL; return nvmet_disc_subsys; @@ -1538,6 +1541,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, subsys->max_qid = NVMET_NR_QUEUES; break; case NVME_NQN_DISC: + case NVME_NQN_CURR: subsys->max_qid = 0; break; default: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 7aa62bc6ae84..c2162eef8ce1 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -146,7 +146,7 @@ static size_t discovery_log_entries(struct nvmet_req *req) struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_subsys_link *p; struct nvmet_port *r; - size_t entries = 0; + size_t entries = 1; list_for_each_entry(p, &req->port->subsystems, entry) { if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) @@ -171,6 +171,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) u32 numrec = 0; u16 status = 0; void *buffer; + char traddr[NVMF_TRADDR_SIZE]; if (!nvmet_check_transfer_len(req, data_len)) return; @@ -203,15 +204,19 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) status = NVME_SC_INTERNAL; goto out; } - hdr = buffer; - list_for_each_entry(p, &req->port->subsystems, entry) { - char traddr[NVMF_TRADDR_SIZE]; + nvmet_set_disc_traddr(req, req->port, traddr); + + nvmet_format_discovery_entry(hdr, req->port, + nvmet_disc_subsys->subsysnqn, + traddr, NVME_NQN_CURR, numrec); + numrec++; + + list_for_each_entry(p, &req->port->subsystems, entry) { if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) continue; - nvmet_set_disc_traddr(req, 
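nvmet_init_cap() above ORs "queue size - 1" into the controller's CAP value because MQES is a zero's-based field: a stored value of N advertises N + 1 queue entries. A short sketch of encoding and decoding such a field; the low-16-bit placement follows NVMe's CAP.MQES, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

/* MQES occupies the low 16 bits of CAP and is zero's-based. */
static uint64_t cap_set_mqes(uint64_t cap, unsigned int queue_entries)
{
	return cap | (uint64_t)(queue_entries - 1);
}

static unsigned int cap_queue_entries(uint64_t cap)
{
	return (unsigned int)(cap & 0xffff) + 1;
}

int main(void)
{
	uint64_t cap = 0;

	cap = cap_set_mqes(cap, 128);	/* e.g. a transport reporting 128 entries */
	printf("MQES field = %llu, usable entries = %u\n",
	       (unsigned long long)(cap & 0xffff), cap_queue_entries(cap));
	return 0;
}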
req->port, traddr); nvmet_format_discovery_entry(hdr, req->port, p->subsys->subsysnqn, traddr, NVME_NQN_NVME, numrec); @@ -268,6 +273,8 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) memcpy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE), ' '); + id->cntrltype = NVME_CTRL_DISC; + /* no limit on data transfer sizes for now */ id->mdts = 0; id->cntlid = cpu_to_le16(ctrl->cntlid); @@ -387,7 +394,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) int __init nvmet_init_discovery(void) { nvmet_disc_subsys = - nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_CURR); return PTR_ERR_OR_ZERO(nvmet_disc_subsys); } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 7d0454cee920..70fb587e9413 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -221,7 +221,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) goto out; } - pr_info("creating controller %d for subsystem %s for NQN %s%s.\n", + pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n", + nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, ctrl->pi_support ? " T10-PI is enabled" : ""); req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 0fc2781ab970..70ca9dfc1771 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -5,6 +5,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/module.h> #include "nvmet.h" @@ -86,7 +87,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) ns->bdev = NULL; return ret; } - ns->size = i_size_read(ns->bdev->bd_inode); + ns->size = bdev_nr_bytes(ns->bdev); ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); ns->pi_type = 0; @@ -107,7 +108,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns) { - ns->size = i_size_read(ns->bdev->bd_inode); + ns->size = bdev_nr_bytes(ns->bdev); } u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 1dd1a0fe2e81..6aa30f30b572 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -125,7 +125,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, return call_iter(iocb, &iter); } -static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) +static void nvmet_file_io_done(struct kiocb *iocb, long ret) { struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); u16 status = NVME_SC_SUCCESS; @@ -222,7 +222,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) } complete: - nvmet_file_io_done(&req->f.iocb, ret, 0); + nvmet_file_io_done(&req->f.iocb, ret); return true; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 0285ccc7541f..eb1094254c82 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -384,6 +384,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = PTR_ERR(ctrl->ctrl.admin_q); goto out_cleanup_fabrics_q; } + /* reset stopped state for the fresh admin queue */ + clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); error = nvmf_connect_admin_queue(&ctrl->ctrl); if (error) @@ 
-398,7 +400,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) @@ -428,7 +430,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_io_queues(ctrl); } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_stop_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7143c7fa7464..af193423c10b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -309,6 +309,7 @@ struct nvmet_fabrics_ops { u16 (*install_queue)(struct nvmet_sq *nvme_sq); void (*discovery_chg)(struct nvmet_port *port); u8 (*get_mdts)(const struct nvmet_ctrl *ctrl); + u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl); }; #define NVMET_MAX_INLINE_BIOVEC 8 @@ -576,6 +577,11 @@ static inline struct nvmet_subsys *nvmet_req_subsys(struct nvmet_req *req) return req->sq->ctrl->subsys; } +static inline bool nvmet_is_disc_subsys(struct nvmet_subsys *subsys) +{ + return subsys->type != NVME_NQN_NVME; +} + #ifdef CONFIG_NVME_TARGET_PASSTHRU void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 891174ccd44b..1deb4043e242 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -5,6 +5,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> +#include <linux/blk-integrity.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/err.h> @@ -1818,12 +1819,36 @@ restart: mutex_unlock(&nvmet_rdma_queue_mutex); } +static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port) +{ + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_port *nport = port->nport; + + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { + if (queue->port != nport) + continue; + + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) { struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); if (cm_id) rdma_destroy_id(cm_id); + + /* + * Destroy the remaining queues, which are not belong to any + * controller yet. Do it here after the RDMA-CM was destroyed + * guarantees that no new queue will be created. 
+ */ + nvmet_rdma_destroy_port_queues(port); } static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) @@ -1975,6 +2000,11 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) return NVMET_RDMA_MAX_MDTS; } +static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) +{ + return NVME_RDMA_MAX_QUEUE_SIZE; +} + static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, @@ -1986,6 +2016,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .delete_ctrl = nvmet_rdma_delete_ctrl, .disc_traddr = nvmet_rdma_disc_port_addr, .get_mdts = nvmet_rdma_get_mdts, + .get_max_queue_size = nvmet_rdma_get_max_queue_size, }; static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 07ee347ea3f3..84c387e4bf43 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -702,7 +702,7 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) struct nvmet_tcp_queue *queue = cmd->queue; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { - .iov_base = &cmd->exp_ddgst + cmd->offset, + .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset }; int ret; @@ -1096,7 +1096,7 @@ recv: } if (queue->hdr_digest && - nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) { + nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) { nvmet_tcp_fatal_error(queue); /* fatal */ return -EPROTO; } @@ -1428,6 +1428,7 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) static void nvmet_tcp_release_queue_work(struct work_struct *w) { + struct page *page; struct nvmet_tcp_queue *queue = container_of(w, struct nvmet_tcp_queue, release_work); @@ -1447,6 +1448,8 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_tcp_free_crypto(queue); ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + page = virt_to_head_page(queue->pf_cache.va); + __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); kfree(queue); } @@ -1737,6 +1740,17 @@ err_port: return ret; } +static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port) +{ + struct nvmet_tcp_queue *queue; + + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + if (queue->port == port) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); +} + static void nvmet_tcp_remove_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port = nport->priv; @@ -1746,6 +1760,11 @@ static void nvmet_tcp_remove_port(struct nvmet_port *nport) port->sock->sk->sk_user_data = NULL; write_unlock_bh(&port->sock->sk->sk_callback_lock); cancel_work_sync(&port->accept_work); + /* + * Destroy the remaining queues, which are not belong to any + * controller yet. 
+ */ + nvmet_tcp_destroy_port_queues(port); sock_release(port->sock); kfree(port); diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 3d87fadaa160..8976da38b375 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -1383,7 +1383,8 @@ static void nvmem_shift_read_buffer_in_place(struct nvmem_cell *cell, void *buf) *p-- = 0; /* clear msb bits if any leftover in the last byte */ - *p &= GENMASK((cell->nbits%BITS_PER_BYTE) - 1, 0); + if (cell->nbits % BITS_PER_BYTE) + *p &= GENMASK((cell->nbits % BITS_PER_BYTE) - 1, 0); } static int __nvmem_cell_read(struct nvmem_device *nvmem, diff --git a/drivers/of/base.c b/drivers/of/base.c index f720c0d246f2..0ac17256258d 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -36,6 +36,7 @@ LIST_HEAD(aliases_lookup); struct device_node *of_root; EXPORT_SYMBOL(of_root); struct device_node *of_chosen; +EXPORT_SYMBOL(of_chosen); struct device_node *of_aliases; struct device_node *of_stdout; static const char *of_stdout_options; diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 59c1390cdf42..9da8835ba5a5 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -21,6 +21,7 @@ #include <linux/sort.h> #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/kmemleak.h> #include "of_private.h" @@ -46,6 +47,7 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, err = memblock_mark_nomap(base, size); if (err) memblock_free(base, size); + kmemleak_ignore_phys(base); } return err; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index eaec915ffe62..67c46e52c0dc 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3301,9 +3301,17 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) return 0; if (!keep_devs) { - /* Delete any children which might still exist. */ + struct list_head removed; + + /* Move all present children to the list on stack */ + INIT_LIST_HEAD(&removed); spin_lock_irqsave(&hbus->device_list_lock, flags); - list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) { + list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) + list_move_tail(&hpdev->list_entry, &removed); + spin_unlock_irqrestore(&hbus->device_list_lock, flags); + + /* Remove all children in the list */ + list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { list_del(&hpdev->list_entry); if (hpdev->pci_slot) pci_destroy_slot(hpdev->pci_slot); @@ -3311,7 +3319,6 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) put_pcichild(hpdev); put_pcichild(hpdev); } - spin_unlock_irqrestore(&hbus->device_list_lock, flags); } ret = hv_send_resources_released(hdev); diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index 014868752cd4..dcefdb42ac46 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -62,14 +62,7 @@ static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value) struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev, hotplug_slot); - switch (zdev->state) { - case ZPCI_FN_STATE_STANDBY: - *value = 0; - break; - default: - *value = 1; - break; - } + *value = zpci_is_device_configured(zdev) ? 
1 : 0; return 0; } diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 0099a00af361..4b4792940e86 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -535,6 +535,7 @@ static int msi_verify_entries(struct pci_dev *dev) static int msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; struct msi_desc *entry; int ret; @@ -558,12 +559,14 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, if (ret) goto err; - dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(dev->msi_irq_groups)) { - ret = PTR_ERR(dev->msi_irq_groups); + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); goto err; } + dev->msi_irq_groups = groups; + /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); pci_msi_set_enable(dev, 1); @@ -691,6 +694,7 @@ static void msix_mask_all(void __iomem *base, int tsize) static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; void __iomem *base; int ret, tsize; u16 control; @@ -730,12 +734,14 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, msix_update_entries(dev, entries); - dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); - if (IS_ERR(dev->msi_irq_groups)) { - ret = PTR_ERR(dev->msi_irq_groups); + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); goto out_free; } + dev->msi_irq_groups = groups; + /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); dev->msix_enabled = 1; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 0f40943a9a18..260a06fb78a6 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1249,6 +1249,9 @@ static struct acpi_device *acpi_pci_find_companion(struct device *dev) bool check_children; u64 addr; + if (!dev->parent) + return NULL; + down_read(&pci_acpi_companion_lookup_sem); adev = pci_acpi_find_companion_hook ? 
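The nvmem_shift_read_buffer_in_place() hunk above adds a guard in front of the final-byte mask: GENMASK((cell->nbits % BITS_PER_BYTE) - 1, 0) is only meaningful when the bit count does not end on a byte boundary, since for a multiple of 8 the high index underflows. A minimal userspace sketch of that guard, with a local stand-in for the kernel's GENMASK and an invented helper name, purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define BITS_PER_BYTE	8
#define GENMASK(h, l) \
	(((~0UL) << (l)) & (~0UL >> (BITS_PER_BYTE * sizeof(unsigned long) - 1 - (h))))

/* Clear the unused high bits of the last byte of a value nbits wide. */
static void mask_last_byte(uint8_t *last, unsigned int nbits)
{
	/* Only mask when the value does not end exactly on a byte boundary. */
	if (nbits % BITS_PER_BYTE)
		*last &= GENMASK((nbits % BITS_PER_BYTE) - 1, 0);
}

int main(void)
{
	uint8_t b = 0xAB;

	mask_last_byte(&b, 12);		/* 12 % 8 = 4 -> keep low nibble: 0x0B */
	printf("0x%02X\n", b);

	b = 0xAB;
	mask_last_byte(&b, 16);		/* multiple of 8 -> byte left untouched */
	printf("0x%02X\n", b);
	return 0;
}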
diff --git a/drivers/pinctrl/bcm/pinctrl-ns.c b/drivers/pinctrl/bcm/pinctrl-ns.c index e79690bd8b85..d7f8175d2c1c 100644 --- a/drivers/pinctrl/bcm/pinctrl-ns.c +++ b/drivers/pinctrl/bcm/pinctrl-ns.c @@ -5,7 +5,6 @@ #include <linux/err.h> #include <linux/io.h> -#include <linux/mfd/syscon.h> #include <linux/module.h> #include <linux/of.h> #include <linux/of_device.h> @@ -13,7 +12,6 @@ #include <linux/pinctrl/pinctrl.h> #include <linux/pinctrl/pinmux.h> #include <linux/platform_device.h> -#include <linux/regmap.h> #include <linux/slab.h> #define FLAG_BCM4708 BIT(1) @@ -24,8 +22,7 @@ struct ns_pinctrl { struct device *dev; unsigned int chipset_flag; struct pinctrl_dev *pctldev; - struct regmap *regmap; - u32 offset; + void __iomem *base; struct pinctrl_desc pctldesc; struct ns_pinctrl_group *groups; @@ -232,9 +229,9 @@ static int ns_pinctrl_set_mux(struct pinctrl_dev *pctrl_dev, unset |= BIT(pin_number); } - regmap_read(ns_pinctrl->regmap, ns_pinctrl->offset, &tmp); + tmp = readl(ns_pinctrl->base); tmp &= ~unset; - regmap_write(ns_pinctrl->regmap, ns_pinctrl->offset, tmp); + writel(tmp, ns_pinctrl->base); return 0; } @@ -266,13 +263,13 @@ static const struct of_device_id ns_pinctrl_of_match_table[] = { static int ns_pinctrl_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; const struct of_device_id *of_id; struct ns_pinctrl *ns_pinctrl; struct pinctrl_desc *pctldesc; struct pinctrl_pin_desc *pin; struct ns_pinctrl_group *group; struct ns_pinctrl_function *function; + struct resource *res; int i; ns_pinctrl = devm_kzalloc(dev, sizeof(*ns_pinctrl), GFP_KERNEL); @@ -290,18 +287,12 @@ static int ns_pinctrl_probe(struct platform_device *pdev) return -EINVAL; ns_pinctrl->chipset_flag = (uintptr_t)of_id->data; - ns_pinctrl->regmap = syscon_node_to_regmap(of_get_parent(np)); - if (IS_ERR(ns_pinctrl->regmap)) { - int err = PTR_ERR(ns_pinctrl->regmap); - - dev_err(dev, "Failed to map pinctrl regs: %d\n", err); - - return err; - } - - if (of_property_read_u32(np, "offset", &ns_pinctrl->offset)) { - dev_err(dev, "Failed to get register offset\n"); - return -ENOENT; + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "cru_gpio_control"); + ns_pinctrl->base = devm_ioremap_resource(dev, res); + if (IS_ERR(ns_pinctrl->base)) { + dev_err(dev, "Failed to map pinctrl regs\n"); + return PTR_ERR(ns_pinctrl->base); } memcpy(pctldesc, &ns_pinctrl_desc, sizeof(*pctldesc)); diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 8d0f88e9ca88..bae9d429b813 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -840,6 +840,34 @@ static const struct pinconf_ops amd_pinconf_ops = { .pin_config_group_set = amd_pinconf_group_set, }; +static void amd_gpio_irq_init(struct amd_gpio *gpio_dev) +{ + struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + unsigned long flags; + u32 pin_reg, mask; + int i; + + mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3) | + BIT(INTERRUPT_MASK_OFF) | BIT(INTERRUPT_ENABLE_OFF) | + BIT(WAKE_CNTRL_OFF_S4); + + for (i = 0; i < desc->npins; i++) { + int pin = desc->pins[i].number; + const struct pin_desc *pd = pin_desc_get(gpio_dev->pctrl, pin); + + if (!pd) + continue; + + raw_spin_lock_irqsave(&gpio_dev->lock, flags); + + pin_reg = readl(gpio_dev->base + i * 4); + pin_reg &= ~mask; + writel(pin_reg, gpio_dev->base + i * 4); + + raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); + } +} + #ifdef CONFIG_PM_SLEEP static bool amd_gpio_should_save(struct amd_gpio *gpio_dev, 
unsigned int pin) { @@ -976,6 +1004,9 @@ static int amd_gpio_probe(struct platform_device *pdev) return PTR_ERR(gpio_dev->pctrl); } + /* Disable and mask interrupts */ + amd_gpio_irq_init(gpio_dev); + girq = &gpio_dev->gc.irq; girq->chip = &amd_gpio_irqchip; /* This will let us handle the parent IRQ in the driver */ diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index 32ea2a8ec02b..5ff4207df66e 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -3,7 +3,8 @@ if (ARCH_QCOM || COMPILE_TEST) config PINCTRL_MSM tristate "Qualcomm core pin controller driver" - depends on GPIOLIB && (QCOM_SCM || !QCOM_SCM) #if QCOM_SCM=m this can't be =y + depends on GPIOLIB + select QCOM_SCM select PINMUX select PINCONF select GENERIC_PINCONF diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 68b3886f9f0f..dfd8888a222a 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1644,8 +1644,8 @@ int __maybe_unused stm32_pinctrl_resume(struct device *dev) struct stm32_pinctrl_group *g = pctl->groups; int i; - for (i = g->pin; i < g->pin + pctl->ngroups; i++) - stm32_pinctrl_restore_gpio_regs(pctl, i); + for (i = 0; i < pctl->ngroups; i++, g++) + stm32_pinctrl_restore_gpio_regs(pctl, g->pin); return 0; } diff --git a/drivers/platform/mellanox/mlxreg-io.c b/drivers/platform/mellanox/mlxreg-io.c index 7646708d57e4..a916cd89cbbe 100644 --- a/drivers/platform/mellanox/mlxreg-io.c +++ b/drivers/platform/mellanox/mlxreg-io.c @@ -98,7 +98,7 @@ mlxreg_io_get_reg(void *regmap, struct mlxreg_core_data *data, u32 in_val, if (ret) goto access_error; - *regval |= rol32(val, regsize * i); + *regval |= rol32(val, regsize * i * 8); } } @@ -141,7 +141,7 @@ mlxreg_io_attr_store(struct device *dev, struct device_attribute *attr, return -EINVAL; /* Convert buffer to input value. */ - ret = kstrtou32(buf, len, &input_val); + ret = kstrtou32(buf, 0, &input_val); if (ret) return ret; diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index d6a7c896ac86..fc95620101e8 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -476,6 +476,7 @@ static const struct acpi_device_id amd_pmc_acpi_ids[] = { {"AMDI0006", 0}, {"AMDI0007", 0}, {"AMD0004", 0}, + {"AMD0005", 0}, { } }; MODULE_DEVICE_TABLE(acpi, amd_pmc_acpi_ids); diff --git a/drivers/platform/x86/dell/Kconfig b/drivers/platform/x86/dell/Kconfig index 42513eab1d06..2fffa57e596e 100644 --- a/drivers/platform/x86/dell/Kconfig +++ b/drivers/platform/x86/dell/Kconfig @@ -167,6 +167,7 @@ config DELL_WMI config DELL_WMI_PRIVACY bool "Dell WMI Hardware Privacy Support" depends on LEDS_TRIGGER_AUDIO = y || DELL_WMI = LEDS_TRIGGER_AUDIO + depends on DELL_WMI help This option adds integration with the "Dell Hardware Privacy" feature of Dell laptops to the dell-wmi driver. 
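The mlxreg_io_get_reg() hunk above changes the rotate amount from regsize * i to regsize * i * 8: the loop builds a wide register value out of regsize-byte sub-reads, so each chunk has to be placed at its offset in bits, not bytes. A small userspace model of that assembly step, with rol32 reimplemented locally and the chunk values made up for illustration:

#include <stdio.h>
#include <stdint.h>

static inline uint32_t rol32(uint32_t word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

int main(void)
{
	/* Pretend four 1-byte reads returned these values, lowest byte first. */
	const uint32_t chunks[4] = { 0x44, 0x33, 0x22, 0x11 };
	const unsigned int regsize = 1;		/* bytes per sub-read */
	uint32_t regval = 0;

	for (unsigned int i = 0; i < 4; i++)
		regval |= rol32(chunks[i], regsize * i * 8);	/* bits, not bytes */

	printf("0x%08X\n", regval);	/* 0x11223344 */
	return 0;
}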
diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index d53634c8a6e0..658bab4b7964 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -141,6 +141,7 @@ static u8 gigabyte_wmi_detect_sensor_usability(struct wmi_device *wdev) static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M S2H V2"), + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE AX V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 GAMING X V2"), diff --git a/drivers/platform/x86/intel/int1092/intel_sar.c b/drivers/platform/x86/intel/int1092/intel_sar.c index 379560fe5df9..e03943e6380a 100644 --- a/drivers/platform/x86/intel/int1092/intel_sar.c +++ b/drivers/platform/x86/intel/int1092/intel_sar.c @@ -42,12 +42,20 @@ static void update_sar_data(struct wwan_sar_context *context) if (config->device_mode_info && context->sar_data.device_mode < config->total_dev_mode) { - struct wwan_device_mode_info *dev_mode = - &config->device_mode_info[context->sar_data.device_mode]; - - context->sar_data.antennatable_index = dev_mode->antennatable_index; - context->sar_data.bandtable_index = dev_mode->bandtable_index; - context->sar_data.sartable_index = dev_mode->sartable_index; + int itr = 0; + + for (itr = 0; itr < config->total_dev_mode; itr++) { + if (context->sar_data.device_mode == + config->device_mode_info[itr].device_mode) { + struct wwan_device_mode_info *dev_mode = + &config->device_mode_info[itr]; + + context->sar_data.antennatable_index = dev_mode->antennatable_index; + context->sar_data.bandtable_index = dev_mode->bandtable_index; + context->sar_data.sartable_index = dev_mode->sartable_index; + break; + } + } } } @@ -305,7 +313,6 @@ static struct platform_driver sar_driver = { .remove = sar_remove, .driver = { .name = DRVNAME, - .owner = THIS_MODULE, .acpi_match_table = ACPI_PTR(sar_device_ids) } }; @@ -313,4 +320,4 @@ module_platform_driver(sar_driver); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Platform device driver for INTEL MODEM BIOS SAR"); -MODULE_AUTHOR("Shravan S <s.shravan@intel.com>"); +MODULE_AUTHOR("Shravan Sudhakar <s.shravan@intel.com>"); diff --git a/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c b/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c index 9fe0a2527e1c..e59d79c7e82f 100644 --- a/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c +++ b/drivers/platform/x86/intel/int3472/intel_skl_int3472_discrete.c @@ -401,7 +401,7 @@ int skl_int3472_discrete_remove(struct platform_device *pdev) gpiod_remove_lookup_table(&int3472->gpios); - if (int3472->clock.ena_gpio) + if (int3472->clock.cl) skl_int3472_unregister_clock(int3472); gpiod_put(int3472->clock.ena_gpio); diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index bfa0cc20750d..7cc9089d1e14 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -75,7 +75,7 @@ struct intel_scu_ipc_dev { #define IPC_READ_BUFFER 0x90 /* Timeout in jiffies */ -#define IPC_TIMEOUT (5 * HZ) +#define IPC_TIMEOUT (10 * HZ) static struct intel_scu_ipc_dev *ipcdev; /* Only one for now */ static DEFINE_MUTEX(ipclock); /* lock used to prevent multiple call to SCU */ @@ -232,7 +232,7 @@ static inline u32 ipc_data_readl(struct intel_scu_ipc_dev *scu, u32 offset) /* Wait till scu status is busy */ static inline 
int busy_loop(struct intel_scu_ipc_dev *scu) { - unsigned long end = jiffies + msecs_to_jiffies(IPC_TIMEOUT); + unsigned long end = jiffies + IPC_TIMEOUT; do { u32 status; @@ -247,7 +247,7 @@ static inline int busy_loop(struct intel_scu_ipc_dev *scu) return -ETIMEDOUT; } -/* Wait till ipc ioc interrupt is received or timeout in 3 HZ */ +/* Wait till ipc ioc interrupt is received or timeout in 10 HZ */ static inline int ipc_wait_for_interrupt(struct intel_scu_ipc_dev *scu) { int status; diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 4dfc52e06704..f9b2d66b0443 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -170,6 +170,7 @@ static void ptp_clock_release(struct device *dev) struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev); ptp_cleanup_pin_groups(ptp); + kfree(ptp->vclock_index); mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->pincfg_mux); mutex_destroy(&ptp->n_vclocks_mux); @@ -283,15 +284,20 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, /* Create a posix clock and link it to the device. */ err = posix_clock_register(&ptp->clock, &ptp->dev); if (err) { + if (ptp->pps_source) + pps_unregister_source(ptp->pps_source); + + if (ptp->kworker) + kthread_destroy_worker(ptp->kworker); + + put_device(&ptp->dev); + pr_err("failed to create posix clock\n"); - goto no_clock; + return ERR_PTR(err); } return ptp; -no_clock: - if (ptp->pps_source) - pps_unregister_source(ptp->pps_source); no_pps: ptp_cleanup_pin_groups(ptp); no_pin_groups: @@ -321,8 +327,6 @@ int ptp_clock_unregister(struct ptp_clock *ptp) ptp->defunct = 1; wake_up_interruptible(&ptp->tsev_wq); - kfree(ptp->vclock_index); - if (ptp->kworker) { kthread_cancel_delayed_work_sync(&ptp->aux_work); kthread_destroy_worker(ptp->kworker); diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c index d0096cd7096a..4991054a2135 100644 --- a/drivers/ptp/ptp_kvm_x86.c +++ b/drivers/ptp/ptp_kvm_x86.c @@ -31,10 +31,10 @@ int kvm_arch_ptp_init(void) ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); - if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) + if (ret == -KVM_ENOSYS) return -ENODEV; - return 0; + return ret; } int kvm_arch_ptp_get_clock(struct timespec64 *ts) diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c index a17e8cc642c5..8070f3fd98f0 100644 --- a/drivers/ptp/ptp_pch.c +++ b/drivers/ptp/ptp_pch.c @@ -644,6 +644,7 @@ static const struct pci_device_id pch_ieee1588_pcidev_id[] = { }, {0} }; +MODULE_DEVICE_TABLE(pci, pch_ieee1588_pcidev_id); static SIMPLE_DEV_PM_OPS(pch_pm_ops, pch_suspend, pch_resume); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index ca6caba8a191..f4d441b1a8bf 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -145,7 +145,7 @@ static inline int regulator_lock_nested(struct regulator_dev *rdev, mutex_lock(®ulator_nesting_mutex); - if (ww_ctx || !ww_mutex_trylock(&rdev->mutex)) { + if (!ww_mutex_trylock(&rdev->mutex, ww_ctx)) { if (rdev->mutex_owner == current) rdev->ref_cnt++; else diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index be799a5abf8a..b0056ae5d463 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -147,8 +147,8 @@ config RESET_OXNAS bool config RESET_PISTACHIO - bool "Pistachio Reset Driver" if COMPILE_TEST - default MACH_PISTACHIO + bool "Pistachio Reset Driver" + depends on MIPS || COMPILE_TEST help This enables the reset driver for ImgTec Pistachio SoCs. 
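The intel_scu_ipc hunks above raise IPC_TIMEOUT to 10 * HZ and stop feeding it through msecs_to_jiffies() a second time in busy_loop(); the constant is already expressed in jiffies, so the double conversion inflated the intended timeout. A rough userspace model of the corrected polling loop, with jiffies, HZ and the device status simulated by plain counters rather than the real timer:

#include <stdbool.h>
#include <stdio.h>

#define HZ		100
#define IPC_TIMEOUT	(10 * HZ)			/* ten seconds, already in jiffies */
#define time_after(a, b)	((long)((b) - (a)) < 0)

static unsigned long jiffies;				/* stands in for the kernel tick */
static bool scu_busy = true;

static int busy_loop(void)
{
	unsigned long end = jiffies + IPC_TIMEOUT;	/* no extra msecs_to_jiffies() */

	do {
		if (!scu_busy)
			return 0;
		if (++jiffies == 300)
			scu_busy = false;		/* device eventually goes idle */
	} while (time_after(end, jiffies));

	return -1;					/* -ETIMEDOUT in the driver */
}

int main(void)
{
	printf("busy_loop() = %d\n", busy_loop());
	return 0;
}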
diff --git a/drivers/reset/reset-brcmstb-rescal.c b/drivers/reset/reset-brcmstb-rescal.c index b6f074d6a65f..433fa0c40e47 100644 --- a/drivers/reset/reset-brcmstb-rescal.c +++ b/drivers/reset/reset-brcmstb-rescal.c @@ -38,7 +38,7 @@ static int brcm_rescal_reset_set(struct reset_controller_dev *rcdev, } ret = readl_poll_timeout(base + BRCM_RESCAL_STATUS, reg, - !(reg & BRCM_RESCAL_STATUS_BIT), 100, 1000); + (reg & BRCM_RESCAL_STATUS_BIT), 100, 1000); if (ret) { dev_err(data->dev, "time out on SATA/PCIe rescal\n"); return ret; diff --git a/drivers/reset/reset-socfpga.c b/drivers/reset/reset-socfpga.c index 2a72f861f798..8c6492e5693c 100644 --- a/drivers/reset/reset-socfpga.c +++ b/drivers/reset/reset-socfpga.c @@ -92,3 +92,29 @@ void __init socfpga_reset_init(void) for_each_matching_node(np, socfpga_early_reset_dt_ids) a10_reset_init(np); } + +/* + * The early driver is problematic, because it doesn't register + * itself as a driver. This causes certain device links to prevent + * consumer devices from probing. The hacky solution is to register + * an empty driver, whose only job is to attach itself to the reset + * manager and call probe. + */ +static const struct of_device_id socfpga_reset_dt_ids[] = { + { .compatible = "altr,rst-mgr", }, + { /* sentinel */ }, +}; + +static int reset_simple_probe(struct platform_device *pdev) +{ + return 0; +} + +static struct platform_driver reset_socfpga_driver = { + .probe = reset_simple_probe, + .driver = { + .name = "socfpga-reset", + .of_match_table = socfpga_reset_dt_ids, + }, +}; +builtin_platform_driver(reset_socfpga_driver); diff --git a/drivers/reset/tegra/reset-bpmp.c b/drivers/reset/tegra/reset-bpmp.c index 24d3395964cc..4c5bba52b105 100644 --- a/drivers/reset/tegra/reset-bpmp.c +++ b/drivers/reset/tegra/reset-bpmp.c @@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc); struct mrq_reset_request request; struct tegra_bpmp_message msg; + int err; memset(&request, 0, sizeof(request)); request.cmd = command; @@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, msg.tx.data = &request; msg.tx.size = sizeof(request); - return tegra_bpmp_transfer(bpmp, &msg); + err = tegra_bpmp_transfer(bpmp, &msg); + if (err) + return err; + if (msg.rx.ret) + return -EINVAL; + + return 0; } static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc, diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index e34c6cc61983..8e87a31e329d 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2077,12 +2077,15 @@ static void __dasd_device_check_path_events(struct dasd_device *device) if (device->stopped & ~(DASD_STOPPED_DC_WAIT)) return; + + dasd_path_clear_all_verify(device); + dasd_path_clear_all_fcsec(device); + rc = device->discipline->pe_handler(device, tbvpm, fcsecpm); if (rc) { + dasd_path_add_tbvpm(device, tbvpm); + dasd_path_add_fcsecpm(device, fcsecpm); dasd_device_set_timer(device, 50); - } else { - dasd_path_clear_all_verify(device); - dasd_path_clear_all_fcsec(device); } }; diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 4691a3c35d72..299001ad9a32 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -201,7 +201,7 @@ dasd_3990_erp_DCTL(struct dasd_ccw_req * erp, char modifier) struct ccw1 *ccw; struct dasd_ccw_req *dctl_cqr; - dctl_cqr = dasd_alloc_erp_request((char *) &erp->magic, 1, + dctl_cqr = 
dasd_alloc_erp_request(erp->magic, 1, sizeof(struct DCTL_data), device); if (IS_ERR(dctl_cqr)) { @@ -1652,7 +1652,7 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense) } /* Build new ERP request including DE/LO */ - erp = dasd_alloc_erp_request((char *) &cqr->magic, + erp = dasd_alloc_erp_request(cqr->magic, 2 + 1,/* DE/LO + TIC */ sizeof(struct DE_eckd_data) + sizeof(struct LO_eckd_data), device); @@ -2388,7 +2388,7 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr) } /* allocate additional request block */ - erp = dasd_alloc_erp_request((char *) &cqr->magic, + erp = dasd_alloc_erp_request(cqr->magic, cplength, datasize, device); if (IS_ERR(erp)) { if (cqr->retries <= 0) { diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 460e0f1cca53..8410a25a65c1 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -560,8 +560,8 @@ static int prefix_LRE(struct ccw1 *ccw, struct PFX_eckd_data *pfxdata, return -EINVAL; } pfxdata->format = format; - pfxdata->base_address = basepriv->ned->unit_addr; - pfxdata->base_lss = basepriv->ned->ID; + pfxdata->base_address = basepriv->conf.ned->unit_addr; + pfxdata->base_lss = basepriv->conf.ned->ID; pfxdata->validity.define_extent = 1; /* private uid is kept up to date, conf_data may be outdated */ @@ -736,32 +736,30 @@ dasd_eckd_cdl_reclen(int recid) return LABEL_SIZE; } /* create unique id from private structure. */ -static void create_uid(struct dasd_eckd_private *private) +static void create_uid(struct dasd_conf *conf, struct dasd_uid *uid) { int count; - struct dasd_uid *uid; - uid = &private->uid; memset(uid, 0, sizeof(struct dasd_uid)); - memcpy(uid->vendor, private->ned->HDA_manufacturer, + memcpy(uid->vendor, conf->ned->HDA_manufacturer, sizeof(uid->vendor) - 1); EBCASC(uid->vendor, sizeof(uid->vendor) - 1); - memcpy(uid->serial, &private->ned->serial, + memcpy(uid->serial, &conf->ned->serial, sizeof(uid->serial) - 1); EBCASC(uid->serial, sizeof(uid->serial) - 1); - uid->ssid = private->gneq->subsystemID; - uid->real_unit_addr = private->ned->unit_addr; - if (private->sneq) { - uid->type = private->sneq->sua_flags; + uid->ssid = conf->gneq->subsystemID; + uid->real_unit_addr = conf->ned->unit_addr; + if (conf->sneq) { + uid->type = conf->sneq->sua_flags; if (uid->type == UA_BASE_PAV_ALIAS) - uid->base_unit_addr = private->sneq->base_unit_addr; + uid->base_unit_addr = conf->sneq->base_unit_addr; } else { uid->type = UA_BASE_DEVICE; } - if (private->vdsneq) { + if (conf->vdsneq) { for (count = 0; count < 16; count++) { sprintf(uid->vduit+2*count, "%02x", - private->vdsneq->uit[count]); + conf->vdsneq->uit[count]); } } } @@ -776,10 +774,10 @@ static int dasd_eckd_generate_uid(struct dasd_device *device) if (!private) return -ENODEV; - if (!private->ned || !private->gneq) + if (!private->conf.ned || !private->conf.gneq) return -ENODEV; spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); - create_uid(private); + create_uid(&private->conf, &private->uid); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); return 0; } @@ -803,14 +801,15 @@ static int dasd_eckd_get_uid(struct dasd_device *device, struct dasd_uid *uid) * return 0 for match */ static int dasd_eckd_compare_path_uid(struct dasd_device *device, - struct dasd_eckd_private *private) + struct dasd_conf *path_conf) { struct dasd_uid device_uid; + struct dasd_uid path_uid; - create_uid(private); + create_uid(path_conf, &path_uid); dasd_eckd_get_uid(device, &device_uid); - 
return memcmp(&device_uid, &private->uid, sizeof(struct dasd_uid)); + return memcmp(&device_uid, &path_uid, sizeof(struct dasd_uid)); } static void dasd_eckd_fill_rcd_cqr(struct dasd_device *device, @@ -946,34 +945,34 @@ out_error: return ret; } -static int dasd_eckd_identify_conf_parts(struct dasd_eckd_private *private) +static int dasd_eckd_identify_conf_parts(struct dasd_conf *conf) { struct dasd_sneq *sneq; int i, count; - private->ned = NULL; - private->sneq = NULL; - private->vdsneq = NULL; - private->gneq = NULL; - count = private->conf_len / sizeof(struct dasd_sneq); - sneq = (struct dasd_sneq *)private->conf_data; + conf->ned = NULL; + conf->sneq = NULL; + conf->vdsneq = NULL; + conf->gneq = NULL; + count = conf->len / sizeof(struct dasd_sneq); + sneq = (struct dasd_sneq *)conf->data; for (i = 0; i < count; ++i) { if (sneq->flags.identifier == 1 && sneq->format == 1) - private->sneq = sneq; + conf->sneq = sneq; else if (sneq->flags.identifier == 1 && sneq->format == 4) - private->vdsneq = (struct vd_sneq *)sneq; + conf->vdsneq = (struct vd_sneq *)sneq; else if (sneq->flags.identifier == 2) - private->gneq = (struct dasd_gneq *)sneq; + conf->gneq = (struct dasd_gneq *)sneq; else if (sneq->flags.identifier == 3 && sneq->res1 == 1) - private->ned = (struct dasd_ned *)sneq; + conf->ned = (struct dasd_ned *)sneq; sneq++; } - if (!private->ned || !private->gneq) { - private->ned = NULL; - private->sneq = NULL; - private->vdsneq = NULL; - private->gneq = NULL; + if (!conf->ned || !conf->gneq) { + conf->ned = NULL; + conf->sneq = NULL; + conf->vdsneq = NULL; + conf->gneq = NULL; return -EINVAL; } return 0; @@ -1016,9 +1015,9 @@ static void dasd_eckd_store_conf_data(struct dasd_device *device, * with the new one if this points to the same data */ cdp = device->path[chp].conf_data; - if (private->conf_data == cdp) { - private->conf_data = (void *)conf_data; - dasd_eckd_identify_conf_parts(private); + if (private->conf.data == cdp) { + private->conf.data = (void *)conf_data; + dasd_eckd_identify_conf_parts(&private->conf); } ccw_device_get_schid(device->cdev, &sch_id); device->path[chp].conf_data = conf_data; @@ -1036,8 +1035,8 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) struct dasd_eckd_private *private = device->private; int i; - private->conf_data = NULL; - private->conf_len = 0; + private->conf.data = NULL; + private->conf.len = 0; for (i = 0; i < 8; i++) { kfree(device->path[i].conf_data); device->path[i].conf_data = NULL; @@ -1071,15 +1070,55 @@ static void dasd_eckd_read_fc_security(struct dasd_device *device) } } +static void dasd_eckd_get_uid_string(struct dasd_conf *conf, + char *print_uid) +{ + struct dasd_uid uid; + + create_uid(conf, &uid); + if (strlen(uid.vduit) > 0) + snprintf(print_uid, sizeof(*print_uid), + "%s.%s.%04x.%02x.%s", + uid.vendor, uid.serial, uid.ssid, + uid.real_unit_addr, uid.vduit); + else + snprintf(print_uid, sizeof(*print_uid), + "%s.%s.%04x.%02x", + uid.vendor, uid.serial, uid.ssid, + uid.real_unit_addr); +} + +static int dasd_eckd_check_cabling(struct dasd_device *device, + void *conf_data, __u8 lpm) +{ + struct dasd_eckd_private *private = device->private; + char print_path_uid[60], print_device_uid[60]; + struct dasd_conf path_conf; + + path_conf.data = conf_data; + path_conf.len = DASD_ECKD_RCD_DATA_SIZE; + if (dasd_eckd_identify_conf_parts(&path_conf)) + return 1; + + if (dasd_eckd_compare_path_uid(device, &path_conf)) { + dasd_eckd_get_uid_string(&path_conf, print_path_uid); + dasd_eckd_get_uid_string(&private->conf, 
print_device_uid); + dev_err(&device->cdev->dev, + "Not all channel paths lead to the same device, path %02X leads to device %s instead of %s\n", + lpm, print_path_uid, print_device_uid); + return 1; + } + + return 0; +} + static int dasd_eckd_read_conf(struct dasd_device *device) { void *conf_data; int conf_len, conf_data_saved; int rc, path_err, pos; __u8 lpm, opm; - struct dasd_eckd_private *private, path_private; - struct dasd_uid *uid; - char print_path_uid[60], print_device_uid[60]; + struct dasd_eckd_private *private; private = device->private; opm = ccw_device_get_path_mask(device->cdev); @@ -1109,11 +1148,11 @@ static int dasd_eckd_read_conf(struct dasd_device *device) if (!conf_data_saved) { /* initially clear previously stored conf_data */ dasd_eckd_clear_conf_data(device); - private->conf_data = conf_data; - private->conf_len = conf_len; - if (dasd_eckd_identify_conf_parts(private)) { - private->conf_data = NULL; - private->conf_len = 0; + private->conf.data = conf_data; + private->conf.len = conf_len; + if (dasd_eckd_identify_conf_parts(&private->conf)) { + private->conf.data = NULL; + private->conf.len = 0; kfree(conf_data); continue; } @@ -1123,59 +1162,11 @@ static int dasd_eckd_read_conf(struct dasd_device *device) */ dasd_eckd_generate_uid(device); conf_data_saved++; - } else { - path_private.conf_data = conf_data; - path_private.conf_len = DASD_ECKD_RCD_DATA_SIZE; - if (dasd_eckd_identify_conf_parts( - &path_private)) { - path_private.conf_data = NULL; - path_private.conf_len = 0; - kfree(conf_data); - continue; - } - if (dasd_eckd_compare_path_uid( - device, &path_private)) { - uid = &path_private.uid; - if (strlen(uid->vduit) > 0) - snprintf(print_path_uid, - sizeof(print_path_uid), - "%s.%s.%04x.%02x.%s", - uid->vendor, uid->serial, - uid->ssid, uid->real_unit_addr, - uid->vduit); - else - snprintf(print_path_uid, - sizeof(print_path_uid), - "%s.%s.%04x.%02x", - uid->vendor, uid->serial, - uid->ssid, - uid->real_unit_addr); - uid = &private->uid; - if (strlen(uid->vduit) > 0) - snprintf(print_device_uid, - sizeof(print_device_uid), - "%s.%s.%04x.%02x.%s", - uid->vendor, uid->serial, - uid->ssid, uid->real_unit_addr, - uid->vduit); - else - snprintf(print_device_uid, - sizeof(print_device_uid), - "%s.%s.%04x.%02x", - uid->vendor, uid->serial, - uid->ssid, - uid->real_unit_addr); - dev_err(&device->cdev->dev, - "Not all channel paths lead to " - "the same device, path %02X leads to " - "device %s instead of %s\n", lpm, - print_path_uid, print_device_uid); - path_err = -EINVAL; - dasd_path_add_cablepm(device, lpm); - continue; - } - path_private.conf_data = NULL; - path_private.conf_len = 0; + } else if (dasd_eckd_check_cabling(device, conf_data, lpm)) { + dasd_path_add_cablepm(device, lpm); + path_err = -EINVAL; + kfree(conf_data); + continue; } pos = pathmask_to_pos(lpm); @@ -1197,8 +1188,6 @@ static int dasd_eckd_read_conf(struct dasd_device *device) } } - dasd_eckd_read_fc_security(device); - return path_err; } @@ -1213,7 +1202,7 @@ static u32 get_fcx_max_data(struct dasd_device *device) return 0; /* is transport mode supported? 
*/ fcx_in_css = css_general_characteristics.fcx; - fcx_in_gneq = private->gneq->reserved2[7] & 0x04; + fcx_in_gneq = private->conf.gneq->reserved2[7] & 0x04; fcx_in_features = private->features.feature[40] & 0x80; tpm = fcx_in_css && fcx_in_gneq && fcx_in_features; @@ -1282,9 +1271,9 @@ static int rebuild_device_uid(struct dasd_device *device, "returned error %d", rc); break; } - memcpy(private->conf_data, data->rcd_buffer, + memcpy(private->conf.data, data->rcd_buffer, DASD_ECKD_RCD_DATA_SIZE); - if (dasd_eckd_identify_conf_parts(private)) { + if (dasd_eckd_identify_conf_parts(&private->conf)) { rc = -ENODEV; } else /* first valid path is enough */ break; @@ -1299,11 +1288,10 @@ static int rebuild_device_uid(struct dasd_device *device, static void dasd_eckd_path_available_action(struct dasd_device *device, struct pe_handler_work_data *data) { - struct dasd_eckd_private path_private; - struct dasd_uid *uid; __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE]; __u8 lpm, opm, npm, ppm, epm, hpfpm, cablepm; struct dasd_conf_data *conf_data; + struct dasd_conf path_conf; unsigned long flags; char print_uid[60]; int rc, pos; @@ -1367,11 +1355,11 @@ static void dasd_eckd_path_available_action(struct dasd_device *device, */ memcpy(&path_rcd_buf, data->rcd_buffer, DASD_ECKD_RCD_DATA_SIZE); - path_private.conf_data = (void *) &path_rcd_buf; - path_private.conf_len = DASD_ECKD_RCD_DATA_SIZE; - if (dasd_eckd_identify_conf_parts(&path_private)) { - path_private.conf_data = NULL; - path_private.conf_len = 0; + path_conf.data = (void *)&path_rcd_buf; + path_conf.len = DASD_ECKD_RCD_DATA_SIZE; + if (dasd_eckd_identify_conf_parts(&path_conf)) { + path_conf.data = NULL; + path_conf.len = 0; continue; } @@ -1382,7 +1370,7 @@ static void dasd_eckd_path_available_action(struct dasd_device *device, * the first working path UID will be used as device UID */ if (dasd_path_get_opm(device) && - dasd_eckd_compare_path_uid(device, &path_private)) { + dasd_eckd_compare_path_uid(device, &path_conf)) { /* * the comparison was not successful * rebuild the device UID with at least one @@ -1396,20 +1384,8 @@ static void dasd_eckd_path_available_action(struct dasd_device *device, */ if (rebuild_device_uid(device, data) || dasd_eckd_compare_path_uid( - device, &path_private)) { - uid = &path_private.uid; - if (strlen(uid->vduit) > 0) - snprintf(print_uid, sizeof(print_uid), - "%s.%s.%04x.%02x.%s", - uid->vendor, uid->serial, - uid->ssid, uid->real_unit_addr, - uid->vduit); - else - snprintf(print_uid, sizeof(print_uid), - "%s.%s.%04x.%02x", - uid->vendor, uid->serial, - uid->ssid, - uid->real_unit_addr); + device, &path_conf)) { + dasd_eckd_get_uid_string(&path_conf, print_uid); dev_err(&device->cdev->dev, "The newly added channel path %02X " "will not be used because it leads " @@ -1427,6 +1403,14 @@ static void dasd_eckd_path_available_action(struct dasd_device *device, if (conf_data) { memcpy(conf_data, data->rcd_buffer, DASD_ECKD_RCD_DATA_SIZE); + } else { + /* + * path is operational but path config data could not + * be stored due to low mem condition + * add it to the error path mask and schedule a path + * verification later that this could be added again + */ + epm |= lpm; } pos = pathmask_to_pos(lpm); dasd_eckd_store_conf_data(device, conf_data, pos); @@ -1447,7 +1431,10 @@ static void dasd_eckd_path_available_action(struct dasd_device *device, } dasd_path_add_nppm(device, npm); dasd_path_add_ppm(device, ppm); - dasd_path_add_tbvpm(device, epm); + if (epm) { + dasd_path_add_tbvpm(device, epm); + 
dasd_device_set_timer(device, 50); + } dasd_path_add_cablepm(device, cablepm); dasd_path_add_nohpfpm(device, hpfpm); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); @@ -1625,8 +1612,8 @@ static int dasd_eckd_read_vol_info(struct dasd_device *device) prssdp = cqr->data; prssdp->order = PSF_ORDER_PRSSD; prssdp->suborder = PSF_SUBORDER_VSQ; /* Volume Storage Query */ - prssdp->lss = private->ned->ID; - prssdp->volume = private->ned->unit_addr; + prssdp->lss = private->conf.ned->ID; + prssdp->volume = private->conf.ned->unit_addr; ccw = cqr->cpaddr; ccw->cmd_code = DASD_ECKD_CCW_PSF; @@ -2085,11 +2072,11 @@ dasd_eckd_check_characteristics(struct dasd_device *device) device->path_thrhld = DASD_ECKD_PATH_THRHLD; device->path_interval = DASD_ECKD_PATH_INTERVAL; - if (private->gneq) { + if (private->conf.gneq) { value = 1; - for (i = 0; i < private->gneq->timeout.value; i++) + for (i = 0; i < private->conf.gneq->timeout.value; i++) value = 10 * value; - value = value * private->gneq->timeout.number; + value = value * private->conf.gneq->timeout.number; /* do not accept useless values */ if (value != 0 && value <= DASD_EXPIRES_MAX) device->default_expires = value; @@ -2121,6 +2108,7 @@ dasd_eckd_check_characteristics(struct dasd_device *device) if (rc) goto out_err3; + dasd_eckd_read_fc_security(device); dasd_path_create_kobjects(device); /* Read Feature Codes */ @@ -2195,10 +2183,10 @@ static void dasd_eckd_uncheck_device(struct dasd_device *device) return; dasd_alias_disconnect_device_from_lcu(device); - private->ned = NULL; - private->sneq = NULL; - private->vdsneq = NULL; - private->gneq = NULL; + private->conf.ned = NULL; + private->conf.sneq = NULL; + private->conf.vdsneq = NULL; + private->conf.gneq = NULL; dasd_eckd_clear_conf_data(device); dasd_path_remove_kobjects(device); } @@ -3750,8 +3738,8 @@ dasd_eckd_dso_ras(struct dasd_device *device, struct dasd_block *block, * subset. 
*/ ras_data->op_flags.guarantee_init = !!(features->feature[56] & 0x01); - ras_data->lss = private->ned->ID; - ras_data->dev_addr = private->ned->unit_addr; + ras_data->lss = private->conf.ned->ID; + ras_data->dev_addr = private->conf.ned->unit_addr; ras_data->nr_exts = nr_exts; if (by_extent) { @@ -4293,8 +4281,8 @@ static int prepare_itcw(struct itcw *itcw, memset(&pfxdata, 0, sizeof(pfxdata)); pfxdata.format = 1; /* PFX with LRE */ - pfxdata.base_address = basepriv->ned->unit_addr; - pfxdata.base_lss = basepriv->ned->ID; + pfxdata.base_address = basepriv->conf.ned->unit_addr; + pfxdata.base_lss = basepriv->conf.ned->ID; pfxdata.validity.define_extent = 1; /* private uid is kept up to date, conf_data may be outdated */ @@ -4963,9 +4951,9 @@ dasd_eckd_fill_info(struct dasd_device * device, info->characteristics_size = sizeof(private->rdc_data); memcpy(info->characteristics, &private->rdc_data, sizeof(private->rdc_data)); - info->confdata_size = min((unsigned long)private->conf_len, - sizeof(info->configuration_data)); - memcpy(info->configuration_data, private->conf_data, + info->confdata_size = min_t(unsigned long, private->conf.len, + sizeof(info->configuration_data)); + memcpy(info->configuration_data, private->conf.data, info->confdata_size); return 0; } @@ -5808,6 +5796,8 @@ static int dasd_eckd_reload_device(struct dasd_device *device) if (rc) goto out_err; + dasd_eckd_read_fc_security(device); + rc = dasd_eckd_generate_uid(device); if (rc) goto out_err; @@ -5820,15 +5810,7 @@ static int dasd_eckd_reload_device(struct dasd_device *device) dasd_eckd_get_uid(device, &uid); if (old_base != uid.base_unit_addr) { - if (strlen(uid.vduit) > 0) - snprintf(print_uid, sizeof(print_uid), - "%s.%s.%04x.%02x.%s", uid.vendor, uid.serial, - uid.ssid, uid.base_unit_addr, uid.vduit); - else - snprintf(print_uid, sizeof(print_uid), - "%s.%s.%04x.%02x", uid.vendor, uid.serial, - uid.ssid, uid.base_unit_addr); - + dasd_eckd_get_uid_string(&private->conf, print_uid); dev_info(&device->cdev->dev, "An Alias device was reassigned to a new base device " "with UID: %s\n", print_uid); @@ -5966,8 +5948,8 @@ static int dasd_eckd_query_host_access(struct dasd_device *device, prssdp->order = PSF_ORDER_PRSSD; prssdp->suborder = PSF_SUBORDER_QHA; /* query host access */ /* LSS and Volume that will be queried */ - prssdp->lss = private->ned->ID; - prssdp->volume = private->ned->unit_addr; + prssdp->lss = private->conf.ned->ID; + prssdp->volume = private->conf.ned->unit_addr; /* all other bytes of prssdp must be zero */ ccw = cqr->cpaddr; diff --git a/drivers/s390/block/dasd_eckd.h b/drivers/s390/block/dasd_eckd.h index 65e4630ad2ae..a91b265441cc 100644 --- a/drivers/s390/block/dasd_eckd.h +++ b/drivers/s390/block/dasd_eckd.h @@ -658,16 +658,19 @@ struct dasd_conf_data { struct dasd_gneq gneq; } __packed; -struct dasd_eckd_private { - struct dasd_eckd_characteristics rdc_data; - u8 *conf_data; - int conf_len; - +struct dasd_conf { + u8 *data; + int len; /* pointers to specific parts in the conf_data */ struct dasd_ned *ned; struct dasd_sneq *sneq; struct vd_sneq *vdsneq; struct dasd_gneq *gneq; +}; + +struct dasd_eckd_private { + struct dasd_eckd_characteristics rdc_data; + struct dasd_conf conf; struct eckd_count count_area[5]; int init_cqr_status; diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c index ba4fa372d02d..c07e6e713518 100644 --- a/drivers/s390/block/dasd_erp.c +++ b/drivers/s390/block/dasd_erp.c @@ -24,7 +24,7 @@ #include "dasd_int.h" struct dasd_ccw_req * 
-dasd_alloc_erp_request(char *magic, int cplength, int datasize, +dasd_alloc_erp_request(unsigned int magic, int cplength, int datasize, struct dasd_device * device) { unsigned long flags; @@ -33,8 +33,8 @@ dasd_alloc_erp_request(char *magic, int cplength, int datasize, int size; /* Sanity checks */ - BUG_ON( magic == NULL || datasize > PAGE_SIZE || - (cplength*sizeof(struct ccw1)) > PAGE_SIZE); + BUG_ON(datasize > PAGE_SIZE || + (cplength*sizeof(struct ccw1)) > PAGE_SIZE); size = (sizeof(struct dasd_ccw_req) + 7L) & -8L; if (cplength > 0) @@ -62,7 +62,7 @@ dasd_alloc_erp_request(char *magic, int cplength, int datasize, cqr->data = data; memset(cqr->data, 0, datasize); } - strncpy((char *) &cqr->magic, magic, 4); + cqr->magic = magic; ASCEBC((char *) &cqr->magic, 4); set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); dasd_get_device(device); diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index fa966e0db6ca..3a6f3af240fa 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -14,6 +14,7 @@ #define KMSG_COMPONENT "dasd" #include <linux/interrupt.h> +#include <linux/major.h> #include <linux/fs.h> #include <linux/blkpg.h> diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 155428bfed8a..8b458010f88a 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -887,7 +887,7 @@ void dasd_proc_exit(void); /* externals in dasd_erp.c */ struct dasd_ccw_req *dasd_default_erp_action(struct dasd_ccw_req *); struct dasd_ccw_req *dasd_default_erp_postaction(struct dasd_ccw_req *); -struct dasd_ccw_req *dasd_alloc_erp_request(char *, int, int, +struct dasd_ccw_req *dasd_alloc_erp_request(unsigned int, int, int, struct dasd_device *); void dasd_free_erp_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_log_sense(struct dasd_ccw_req *, struct irb *); @@ -1305,6 +1305,15 @@ static inline void dasd_path_add_ppm(struct dasd_device *device, __u8 pm) dasd_path_preferred(device, chp); } +static inline void dasd_path_add_fcsecpm(struct dasd_device *device, __u8 pm) +{ + int chp; + + for (chp = 0; chp < 8; chp++) + if (pm & (0x80 >> chp)) + dasd_path_fcsec(device, chp); +} + /* * set functions for path masks * the existing path mask will be replaced by the given path mask diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 468cbeb539ff..95349f95758c 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -650,8 +650,8 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode, /** * dasd_biodasdinfo() - fill out the dasd information structure - * @disk [in]: pointer to gendisk structure that references a DASD - * @info [out]: pointer to the dasd_information2_t structure + * @disk: [in] pointer to gendisk structure that references a DASD + * @info: [out] pointer to the dasd_information2_t structure * * Provide access to DASD specific information. 
* The gendisk structure is checked if it belongs to the DASD driver by diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5be3d1c39a78..59e513d34b0f 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -30,7 +30,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); -static blk_qc_t dcssblk_submit_bio(struct bio *bio); +static void dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); @@ -854,7 +854,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode) up_write(&dcssblk_devices_sem); } -static blk_qc_t +static void dcssblk_submit_bio(struct bio *bio) { struct dcssblk_dev_info *dev_info; @@ -907,10 +907,9 @@ dcssblk_submit_bio(struct bio *bio) bytes_done += bvec.bv_len; } bio_endio(bio); - return BLK_QC_T_NONE; + return; fail: bio_io_error(bio); - return BLK_QC_T_NONE; } static long diff --git a/drivers/scsi/arm/acornscsi.c b/drivers/scsi/arm/acornscsi.c index b4cb5fb19998..0cc62c1b0825 100644 --- a/drivers/scsi/arm/acornscsi.c +++ b/drivers/scsi/arm/acornscsi.c @@ -1776,7 +1776,7 @@ int acornscsi_reconnect_finish(AS_Host *host) host->scsi.disconnectable = 0; if (host->SCpnt->device->id == host->scsi.reconnected.target && host->SCpnt->device->lun == host->scsi.reconnected.lun && - scsi_cmd_to_tag(host->SCpnt) == host->scsi.reconnected.tag) { + scsi_cmd_to_rq(host->SCpnt)->tag == host->scsi.reconnected.tag) { #if (DEBUG & (DEBUG_QUEUES|DEBUG_DISCON)) DBG(host->SCpnt, printk("scsi%d.%c: reconnected", host->host->host_no, acornscsi_target(host))); diff --git a/drivers/scsi/elx/efct/efct_scsi.c b/drivers/scsi/elx/efct/efct_scsi.c index 40fb3a724c76..cf2e41dd354c 100644 --- a/drivers/scsi/elx/efct/efct_scsi.c +++ b/drivers/scsi/elx/efct/efct_scsi.c @@ -32,7 +32,7 @@ efct_scsi_io_alloc(struct efct_node *node) struct efct *efct; struct efct_xport *xport; struct efct_io *io; - unsigned long flags = 0; + unsigned long flags; efct = node->efct; @@ -44,7 +44,6 @@ efct_scsi_io_alloc(struct efct_node *node) if (!io) { efc_log_err(efct, "IO alloc Failed\n"); atomic_add_return(1, &xport->io_alloc_failed_count); - spin_unlock_irqrestore(&node->active_ios_lock, flags); return NULL; } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 3ab669dc806f..27884f3106ab 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3,6 +3,7 @@ * Copyright (c) 2017 Hisilicon Limited. */ +#include <linux/sched/clock.h> #include "hisi_sas.h" #define DRV_NAME "hisi_sas_v3_hw" diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 3f6f14f0cafb..24b72ee4246f 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -220,7 +220,8 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, goto fail; } - shost->cmd_per_lun = min_t(short, shost->cmd_per_lun, + /* Use min_t(int, ...) 
in case shost->can_queue exceeds SHRT_MAX */ + shost->cmd_per_lun = min_t(int, shost->cmd_per_lun, shost->can_queue); error = scsi_init_sense_cache(shost); diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 1f1586ad48fe..01f79991bf4a 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -1696,6 +1696,7 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt, spin_lock_irqsave(&evt->queue->l_lock, flags); list_add_tail(&evt->queue_list, &evt->queue->sent); + atomic_set(&evt->active, 1); mb(); @@ -1710,6 +1711,7 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt, be64_to_cpu(crq_as_u64[1])); if (rc) { + atomic_set(&evt->active, 0); list_del(&evt->queue_list); spin_unlock_irqrestore(&evt->queue->l_lock, flags); del_timer(&evt->timer); @@ -1737,7 +1739,6 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt, evt->done(evt); } else { - atomic_set(&evt->active, 1); spin_unlock_irqrestore(&evt->queue->l_lock, flags); ibmvfc_trc_start(evt); } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 4683c183e9d4..5bc91d34df63 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2281,11 +2281,6 @@ int iscsi_eh_abort(struct scsi_cmnd *sc) return FAILED; } - conn = session->leadconn; - iscsi_get_conn(conn->cls_conn); - conn->eh_abort_cnt++; - age = session->age; - spin_lock(&session->back_lock); task = (struct iscsi_task *)sc->SCp.ptr; if (!task || !task->sc) { @@ -2293,8 +2288,16 @@ int iscsi_eh_abort(struct scsi_cmnd *sc) ISCSI_DBG_EH(session, "sc completed while abort in progress\n"); spin_unlock(&session->back_lock); - goto success; + spin_unlock_bh(&session->frwd_lock); + mutex_unlock(&session->eh_mutex); + return SUCCESS; } + + conn = session->leadconn; + iscsi_get_conn(conn->cls_conn); + conn->eh_abort_cnt++; + age = session->age; + ISCSI_DBG_EH(session, "aborting [sc %p itt 0x%x]\n", sc, task->itt); __iscsi_get_task(task); spin_unlock(&session->back_lock); diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index befeb7c34290..337e6ed24821 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -22,6 +22,7 @@ *******************************************************************/ #include <scsi/scsi_host.h> +#include <linux/hashtable.h> #include <linux/ktime.h> #include <linux/workqueue.h> diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 78ce38d7251c..026a1196a54d 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -12292,12 +12292,12 @@ void lpfc_ignore_els_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, struct lpfc_iocbq *rspiocb) { - struct lpfc_nodelist *ndlp = (struct lpfc_nodelist *) cmdiocb->context1; + struct lpfc_nodelist *ndlp = NULL; IOCB_t *irsp = &rspiocb->iocb; /* ELS cmd tag <ulpIoTag> completes */ lpfc_printf_log(phba, KERN_INFO, LOG_ELS, - "0139 Ignoring ELS cmd tag x%x completion Data: " + "0139 Ignoring ELS cmd code x%x completion Data: " "x%x x%x x%x\n", irsp->ulpIoTag, irsp->ulpStatus, irsp->un.ulpWord[4], irsp->ulpTimeout); @@ -12305,10 +12305,13 @@ lpfc_ignore_els_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, * Deref the ndlp after free_iocb. sli_release_iocb will access the ndlp * if exchange is busy. 
*/ - if (cmdiocb->iocb.ulpCommand == CMD_GEN_REQUEST64_CR) + if (cmdiocb->iocb.ulpCommand == CMD_GEN_REQUEST64_CR) { + ndlp = cmdiocb->context_un.ndlp; lpfc_ct_free_iocb(phba, cmdiocb); - else + } else { + ndlp = (struct lpfc_nodelist *) cmdiocb->context1; lpfc_els_free_iocb(phba, cmdiocb); + } lpfc_nlp_put(ndlp); } diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index 2197988333fe..3cae8803383b 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -3736,7 +3736,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id) shost->max_lun = -1; shost->unique_id = mrioc->id; - shost->max_channel = 1; + shost->max_channel = 0; shost->max_id = 0xFFFFFFFF; if (prot_mask >= 0) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index d383d4a03436..ad1b6c2b37a7 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -5065,9 +5065,12 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd, if (scmd->prot_flags & SCSI_PROT_GUARD_CHECK) eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD; - if (scmd->prot_flags & SCSI_PROT_REF_CHECK) { - eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | - MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG; + if (scmd->prot_flags & SCSI_PROT_REF_CHECK) + eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG; + + if (scmd->prot_flags & SCSI_PROT_REF_INCREMENT) { + eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG; + mpi_request->CDB.EEDP32.PrimaryReferenceTag = cpu_to_be32(scsi_prot_ref_tag(scmd)); } diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c index 4b5d28d89d69..655cf5de604b 100644 --- a/drivers/scsi/qla2xxx/qla_bsg.c +++ b/drivers/scsi/qla2xxx/qla_bsg.c @@ -431,7 +431,7 @@ done_unmap_sg: goto done_free_fcport; done_free_fcport: - if (bsg_request->msgcode == FC_BSG_RPT_ELS) + if (bsg_request->msgcode != FC_BSG_RPT_ELS) qla2x00_free_fcport(fcport); done: return rval; diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 1c5da2dbd6f9..253055cf9daf 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -8,6 +8,8 @@ #include <linux/delay.h> #include <linux/nvme.h> #include <linux/nvme-fc.h> +#include <linux/blk-mq-pci.h> +#include <linux/blk-mq.h> static struct nvme_fc_port_template qla_nvme_fc_transport; @@ -642,6 +644,18 @@ static int qla_nvme_post_cmd(struct nvme_fc_local_port *lport, return rval; } +static void qla_nvme_map_queues(struct nvme_fc_local_port *lport, + struct blk_mq_queue_map *map) +{ + struct scsi_qla_host *vha = lport->private; + int rc; + + rc = blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset); + if (rc) + ql_log(ql_log_warn, vha, 0x21de, + "pci map queue failed 0x%x", rc); +} + static void qla_nvme_localport_delete(struct nvme_fc_local_port *lport) { struct scsi_qla_host *vha = lport->private; @@ -676,6 +690,7 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = { .ls_abort = qla_nvme_ls_abort, .fcp_io = qla_nvme_post_cmd, .fcp_abort = qla_nvme_fcp_abort, + .map_queues = qla_nvme_map_queues, .max_hw_queues = 8, .max_sgl_segments = 1024, .max_dif_sgl_segments = 64, diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index d2e40aaba734..836fedcea241 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4157,7 +4157,7 @@ qla2x00_mem_alloc(struct qla_hw_data *ha, uint16_t req_len, uint16_t rsp_len, ql_dbg_pci(ql_dbg_init, ha->pdev, 0xe0ee, "%s: 
failed alloc dsd\n", __func__); - return 1; + return -ENOMEM; } ha->dif_bundle_kallocs++; diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index b3478ed9b12e..7d8242c120fc 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -3319,8 +3319,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, "RESET-RSP online/active/old-count/new-count = %d/%d/%d/%d.\n", vha->flags.online, qla2x00_reset_active(vha), cmd->reset_count, qpair->chip_reset); - spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); - return 0; + goto out_unmap_unlock; } /* Does F/W have an IOCBs for this request */ @@ -3445,10 +3444,6 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) prm.sg = NULL; prm.req_cnt = 1; - /* Calculate number of entries and segments required */ - if (qlt_pci_map_calc_cnt(&prm) != 0) - return -EAGAIN; - if (!qpair->fw_started || (cmd->reset_count != qpair->chip_reset) || (cmd->sess && cmd->sess->deleted)) { /* @@ -3466,6 +3461,10 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) return 0; } + /* Calculate number of entries and segments required */ + if (qlt_pci_map_calc_cnt(&prm) != 0) + return -EAGAIN; + spin_lock_irqsave(qpair->qp_lock_ptr, flags); /* Does F/W have an IOCBs for this request */ res = qlt_check_reserve_free_req(qpair, prm.req_cnt); @@ -3870,9 +3869,6 @@ void qlt_free_cmd(struct qla_tgt_cmd *cmd) BUG_ON(cmd->cmd_in_wq); - if (cmd->sg_mapped) - qlt_unmap_sg(cmd->vha, cmd); - if (!cmd->q_full) qlt_decr_num_pend_cmds(cmd->vha); diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index b241f9e3885c..291ecc33b1fe 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -553,8 +553,10 @@ EXPORT_SYMBOL(scsi_device_get); */ void scsi_device_put(struct scsi_device *sdev) { - module_put(sdev->host->hostt->module); + struct module *mod = sdev->host->hostt->module; + put_device(&sdev->sdev_gendev); + module_put(mod); } EXPORT_SYMBOL(scsi_device_put); diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index 81c3853a2a80..081b84bb7985 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -25,8 +25,8 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, return -EOPNOTSUPP; } - rq = blk_get_request(q, hdr->dout_xfer_len ? - REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); + rq = scsi_alloc_request(q, hdr->dout_xfer_len ? 
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); rq->timeout = timeout; @@ -95,7 +95,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, out_free_cmd: scsi_req_free_cmd(scsi_req(rq)); out_put_request: - blk_put_request(rq); + blk_mq_free_request(rq); return ret; } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 66f507469a31..40b473eea357 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -5384,7 +5384,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip, { bool new_sd_dp; bool inject = false; - bool hipri = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_HIPRI; + bool polled = scsi_cmd_to_rq(cmnd)->cmd_flags & REQ_POLLED; int k, num_in_q, qdepth; unsigned long iflags; u64 ns_from_boot = 0; @@ -5471,7 +5471,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip, if (sdebug_host_max_queue) sd_dp->hc_idx = get_tag(cmnd); - if (hipri) + if (polled) ns_from_boot = ktime_get_boottime_ns(); /* one of the resp_*() response functions is called here */ @@ -5531,7 +5531,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip, kt -= d; } } - if (hipri) { + if (polled) { sd_dp->cmpl_ts = ktime_add(ns_to_ktime(ns_from_boot), kt); spin_lock_irqsave(&sqp->qc_lock, iflags); if (!sd_dp->init_poll) { @@ -5562,7 +5562,7 @@ static int schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip, if (unlikely((sdebug_opts & SDEBUG_OPT_CMD_ABORT) && atomic_read(&sdeb_inject_pending))) sd_dp->aborted = true; - if (hipri) { + if (polled) { sd_dp->cmpl_ts = ns_to_ktime(ns_from_boot); spin_lock_irqsave(&sqp->qc_lock, iflags); if (!sd_dp->init_poll) { @@ -7331,7 +7331,7 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) if (kt_from_boot < sd_dp->cmpl_ts) continue; - } else /* ignoring non REQ_HIPRI requests */ + } else /* ignoring non REQ_POLLED requests */ continue; devip = (struct sdebug_dev_info *)scp->device->hostdata; if (likely(devip)) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index b6c86cce57bf..36870b41c888 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1979,7 +1979,7 @@ maybe_retry: static void eh_lock_door_done(struct request *req, blk_status_t status) { - blk_put_request(req); + blk_mq_free_request(req); } /** @@ -1998,7 +1998,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) struct request *req; struct scsi_request *rq; - req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN, 0); + req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return; rq = scsi_req(req); diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 6ff2207bd45a..34412eac4566 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -438,7 +438,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk, at_head = 1; ret = -ENOMEM; - rq = blk_get_request(sdev->request_queue, writing ? + rq = scsi_alloc_request(sdev->request_queue, writing ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); @@ -490,7 +490,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk, out_free_cdb: scsi_req_free_cmd(req); out_put_request: - blk_put_request(rq); + blk_mq_free_request(rq); return ret; } @@ -561,7 +561,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, } - rq = blk_get_request(q, in_len ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); + rq = scsi_alloc_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -634,7 +634,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, } error: - blk_put_request(rq); + blk_mq_free_request(rq); error_free_buffer: kfree(buffer); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 572673873ddf..9c2b99e12ce3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -21,6 +21,7 @@ #include <linux/hardirq.h> #include <linux/scatterlist.h> #include <linux/blk-mq.h> +#include <linux/blk-integrity.h> #include <linux/ratelimit.h> #include <asm/unaligned.h> @@ -215,7 +216,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, struct scsi_request *rq; int ret; - req = blk_get_request(sdev->request_queue, + req = scsi_alloc_request(sdev->request_queue, data_direction == DMA_TO_DEVICE ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0); @@ -259,7 +260,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, scsi_normalize_sense(rq->sense, rq->sense_len, sshdr); ret = rq->result; out: - blk_put_request(req); + blk_mq_free_request(req); return ret; } @@ -1078,9 +1079,6 @@ EXPORT_SYMBOL(scsi_alloc_sgtables); * This function initializes the members of struct scsi_cmnd that must be * initialized before request processing starts and that won't be * reinitialized if a SCSI command is requeued. - * - * Called from inside blk_get_request() for pass-through requests and from - * inside scsi_init_command() for filesystem requests. */ static void scsi_initialize_rq(struct request *rq) { @@ -1097,6 +1095,18 @@ static void scsi_initialize_rq(struct request *rq) cmd->retries = 0; } +struct request *scsi_alloc_request(struct request_queue *q, + unsigned int op, blk_mq_req_flags_t flags) +{ + struct request *rq; + + rq = blk_mq_alloc_request(q, op, flags); + if (!IS_ERR(rq)) + scsi_initialize_rq(rq); + return rq; +} +EXPORT_SYMBOL_GPL(scsi_alloc_request); + /* * Only called when the request isn't completed by SCSI, and not freed by * SCSI @@ -1783,7 +1793,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq, } -static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx) +static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct Scsi_Host *shost = hctx->driver_data; @@ -1863,7 +1873,6 @@ static const struct blk_mq_ops scsi_mq_ops_no_commit = { #endif .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, - .initialize_rq_fn = scsi_initialize_rq, .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, @@ -1893,7 +1902,6 @@ static const struct blk_mq_ops scsi_mq_ops = { #endif .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, - .initialize_rq_fn = scsi_initialize_rq, .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, @@ -1959,6 +1967,14 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q) return sdev; } +/* + * pktcdvd should have been integrated into the SCSI layers, but for historical + * reasons like the old IDE driver it isn't. This export allows it to safely + * probe if a given device is a SCSI one and only attach to that. 
+ */ +#ifdef CONFIG_CDROM_PKTCDVD_MODULE +EXPORT_SYMBOL_GPL(scsi_device_from_queue); +#endif /** * scsi_block_requests - Utility function used by low-level drivers to prevent diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index fe22191522a3..2808c0cb5711 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -280,7 +280,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, sdev->request_queue = q; q->queuedata = sdev; __scsi_init_queue(sdev->host, q); - blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q); WARN_ON_ONCE(!blk_get_queue(q)); depth = sdev->host->cmd_per_lun ?: 1; diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 86793259e541..a35841b34bfd 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -449,9 +449,12 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; struct scsi_vpd *vpd_pg0 = NULL, *vpd_pg89 = NULL; unsigned long flags; + struct module *mod; sdev = container_of(work, struct scsi_device, ew.work); + mod = sdev->host->hostt->module; + scsi_dh_release_device(sdev); parent = sdev->sdev_gendev.parent; @@ -502,11 +505,17 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) if (parent) put_device(parent); + module_put(mod); } static void scsi_device_dev_release(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); + + /* Set module pointer as NULL in case of module unloading */ + if (!try_module_get(sdp->host->hostt->module)) + sdp->host->hostt->module = NULL; + execute_in_process_context(scsi_device_dev_release_usercontext, &sdp->ew); } diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 922e4c7bd88e..78343d3f9385 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2930,8 +2930,6 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev) session->recovery_tmo = value; break; default: - err = transport->set_param(conn, ev->u.set_param.param, - data, ev->u.set_param.len); if ((conn->state == ISCSI_CONN_BOUND) || (conn->state == ISCSI_CONN_UP)) { err = transport->set_param(conn, ev->u.set_param.param, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 523bf2fdc253..252e43d7c73f 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -48,6 +48,7 @@ #include <linux/blkpg.h> #include <linux/blk-pm.h> #include <linux/delay.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/async.h> @@ -1756,6 +1757,44 @@ static void sd_rescan(struct device *dev) sd_revalidate_disk(sdkp->disk); } +static int sd_get_unique_id(struct gendisk *disk, u8 id[16], + enum blk_unique_id type) +{ + struct scsi_device *sdev = scsi_disk(disk)->device; + const struct scsi_vpd *vpd; + const unsigned char *d; + int ret = -ENXIO, len; + + rcu_read_lock(); + vpd = rcu_dereference(sdev->vpd_pg83); + if (!vpd) + goto out_unlock; + + ret = -EINVAL; + for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) { + /* we only care about designators with LU association */ + if (((d[1] >> 4) & 0x3) != 0x00) + continue; + if ((d[1] & 0xf) != type) + continue; + + /* + * Only exit early if a 16-byte descriptor was found. Otherwise + * keep looking as one with more entropy might still show up. 
+ */ + len = d[3]; + if (len != 8 && len != 12 && len != 16) + continue; + ret = len; + memcpy(id, d + 4, len); + if (len == 16) + break; + } +out_unlock: + rcu_read_unlock(); + return ret; +} + static char sd_pr_type(enum pr_type type) { switch (type) { @@ -1860,6 +1899,7 @@ static const struct block_device_operations sd_fops = { .check_events = sd_check_events, .unlock_native_capacity = sd_unlock_native_capacity, .report_zones = sd_zbc_report_zones, + .get_unique_id = sd_get_unique_id, .pr_ops = &sd_pr_ops, }; @@ -3087,6 +3127,86 @@ static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer) sdkp->security = 1; } +static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf) +{ + return logical_to_sectors(sdkp->device, get_unaligned_be64(buf)); +} + +/** + * sd_read_cpr - Query concurrent positioning ranges + * @sdkp: disk to query + */ +static void sd_read_cpr(struct scsi_disk *sdkp) +{ + struct blk_independent_access_ranges *iars = NULL; + unsigned char *buffer = NULL; + unsigned int nr_cpr = 0; + int i, vpd_len, buf_len = SD_BUF_SIZE; + u8 *desc; + + /* + * We need to have the capacity set first for the block layer to be + * able to check the ranges. + */ + if (sdkp->first_scan) + return; + + if (!sdkp->capacity) + goto out; + + /* + * Concurrent Positioning Ranges VPD: there can be at most 256 ranges, + * leading to a maximum page size of 64 + 256*32 bytes. + */ + buf_len = 64 + 256*32; + buffer = kmalloc(buf_len, GFP_KERNEL); + if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len)) + goto out; + + /* We must have at least a 64B header and one 32B range descriptor */ + vpd_len = get_unaligned_be16(&buffer[2]) + 3; + if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) { + sd_printk(KERN_ERR, sdkp, + "Invalid Concurrent Positioning Ranges VPD page\n"); + goto out; + } + + nr_cpr = (vpd_len - 64) / 32; + if (nr_cpr == 1) { + nr_cpr = 0; + goto out; + } + + iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr); + if (!iars) { + nr_cpr = 0; + goto out; + } + + desc = &buffer[64]; + for (i = 0; i < nr_cpr; i++, desc += 32) { + if (desc[0] != i) { + sd_printk(KERN_ERR, sdkp, + "Invalid Concurrent Positioning Range number\n"); + nr_cpr = 0; + break; + } + + iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8); + iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16); + } + +out: + disk_set_independent_access_ranges(sdkp->disk, iars); + if (nr_cpr && sdkp->nr_actuators != nr_cpr) { + sd_printk(KERN_NOTICE, sdkp, + "%u concurrent positioning ranges\n", nr_cpr); + sdkp->nr_actuators = nr_cpr; + } + + kfree(buffer); +} + /* * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, not a @@ -3202,6 +3322,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); + sd_read_cpr(sdkp); } /* @@ -3683,7 +3804,12 @@ static int sd_resume(struct device *dev) static int sd_resume_runtime(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - struct scsi_device *sdp = sdkp->device; + struct scsi_device *sdp; + + if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ + return 0; + + sdp = sdkp->device; if (sdp->ignore_media_change) { /* clear the device's sense data */ diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b59136c4125b..2e5932bde43d 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -106,6 +106,7 @@ struct scsi_disk { 
u8 protection_type;/* Data Integrity Field */ u8 provisioning_mode; u8 zeroing_mode; + u8 nr_actuators; /* Number of actuators */ unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 4cadb26070a8..349950616adc 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -6,7 +6,7 @@ * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/t10-pi.h> #include <scsi/scsi.h> diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 8f05248920e8..141099ab9092 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -31,6 +31,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include <linux/errno.h> #include <linux/mtio.h> #include <linux/ioctl.h> +#include <linux/major.h> #include <linux/slab.h> #include <linux/fcntl.h> #include <linux/init.h> @@ -814,7 +815,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, if (atomic_read(&sdp->detaching)) { if (srp->bio) { scsi_req_free_cmd(scsi_req(srp->rq)); - blk_put_request(srp->rq); + blk_mq_free_request(srp->rq); srp->rq = NULL; } @@ -1389,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status) */ srp->rq = NULL; scsi_req_free_cmd(scsi_req(rq)); - blk_put_request(rq); + blk_mq_free_request(rq); write_lock_irqsave(&sfp->rq_list_lock, iflags); if (unlikely(srp->orphan)) { @@ -1717,13 +1718,13 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) * * With scsi-mq enabled, there are a fixed number of preallocated * requests equal in number to shost->can_queue. If all of the - * preallocated requests are already in use, then blk_get_request() + * preallocated requests are already in use, then scsi_alloc_request() * will sleep until an active command completes, freeing up a request. * Although waiting in an asynchronous interface is less than ideal, we * do not want to use BLK_MQ_REQ_NOWAIT here because userspace might * not expect an EWOULDBLOCK from this condition. */ - rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ? + rq = scsi_alloc_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) { kfree(long_cmdp); @@ -1829,7 +1830,7 @@ sg_finish_rem_req(Sg_request *srp) if (srp->rq) { scsi_req_free_cmd(scsi_req(srp->rq)); - blk_put_request(srp->rq); + blk_mq_free_request(srp->rq); } if (srp->res_used) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 8b17b35283aa..3009b986d1d7 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -44,6 +44,7 @@ #include <linux/cdrom.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/major.h> #include <linux/blkdev.h> #include <linux/blk-pm.h> #include <linux/mutex.h> @@ -966,7 +967,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf, struct bio *bio; int ret; - rq = blk_get_request(disk->queue, REQ_OP_DRV_IN, 0); + rq = scsi_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); req = scsi_req(rq); @@ -1002,7 +1003,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf, if (blk_rq_unmap_user(bio)) ret = -EFAULT; out_put_request: - blk_put_request(rq); + blk_mq_free_request(rq); return ret; } diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index ae8636d3780b..c2d5608f6b1a 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -32,6 +32,7 @@ static const char *verstr = "20160209"; #include <linux/slab.h> #include <linux/errno.h> #include <linux/mtio.h> +#include <linux/major.h> #include <linux/cdrom.h> #include <linux/ioctl.h> #include <linux/fcntl.h> @@ -529,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status) complete(SRpnt->waiting); blk_rq_unmap_user(tmp); - blk_put_request(req); + blk_mq_free_request(req); } static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, @@ -542,7 +543,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, int err = 0; struct scsi_tape *STp = SRpnt->stp; - req = blk_get_request(SRpnt->stp->device->request_queue, + req = scsi_alloc_request(SRpnt->stp->device->request_queue, data_direction == DMA_TO_DEVICE ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(req)) @@ -556,7 +557,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, err = blk_rq_map_user(req->q, req, mdata, NULL, bufflen, GFP_KERNEL); if (err) { - blk_put_request(req); + blk_mq_free_request(req); return err; } } diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index ebbbc1299c62..9eb1b88a29dd 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1285,11 +1285,15 @@ static void storvsc_on_channel_callback(void *context) foreach_vmbus_pkt(desc, channel) { struct vstor_packet *packet = hv_pkt_data(desc); struct storvsc_cmd_request *request = NULL; + u32 pktlen = hv_pkt_datalen(desc); u64 rqst_id = desc->trans_id; + u32 minlen = rqst_id ? sizeof(struct vstor_packet) - + stor_device->vmscsi_size_delta : sizeof(enum vstor_packet_operation); - if (hv_pkt_datalen(desc) < sizeof(struct vstor_packet) - - stor_device->vmscsi_size_delta) { - dev_err(&device->device, "Invalid packet len\n"); + if (pktlen < minlen) { + dev_err(&device->device, + "Invalid pkt: id=%llu, len=%u, minlen=%u\n", + rqst_id, pktlen, minlen); continue; } @@ -1302,13 +1306,23 @@ static void storvsc_on_channel_callback(void *context) if (rqst_id == 0) { /* * storvsc_on_receive() looks at the vstor_packet in the message - * from the ring buffer. 
If the operation in the vstor_packet is - * COMPLETE_IO, then we call storvsc_on_io_completion(), and - * dereference the guest memory address. Make sure we don't call - * storvsc_on_io_completion() with a guest memory address that is - * zero if Hyper-V were to construct and send such a bogus packet. + * from the ring buffer. + * + * - If the operation in the vstor_packet is COMPLETE_IO, then + * we call storvsc_on_io_completion(), and dereference the + * guest memory address. Make sure we don't call + * storvsc_on_io_completion() with a guest memory address + * that is zero if Hyper-V were to construct and send such + * a bogus packet. + * + * - If the operation in the vstor_packet is FCHBA_DATA, then + * we call cache_wwn(), and access the data payload area of + * the packet (wwn_packet); however, there is no guarantee + * that the packet is big enough to contain such area. + * Future-proof the code by rejecting such a bogus packet. */ - if (packet->operation == VSTOR_OPERATION_COMPLETE_IO) { + if (packet->operation == VSTOR_OPERATION_COMPLETE_IO || + packet->operation == VSTOR_OPERATION_FCHBA_DATA) { dev_err(&device->device, "Invalid packet with ID of 0\n"); continue; } diff --git a/drivers/scsi/ufs/ufs-exynos.c b/drivers/scsi/ufs/ufs-exynos.c index a14dd8ce56d4..bb2dd79a1bcd 100644 --- a/drivers/scsi/ufs/ufs-exynos.c +++ b/drivers/scsi/ufs/ufs-exynos.c @@ -642,9 +642,9 @@ static int exynos_ufs_pre_pwr_mode(struct ufs_hba *hba, } /* setting for three timeout values for traffic class #0 */ - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), 8064); - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), 28224); - ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), 20160); + ufshcd_dme_set(hba, UIC_ARG_MIB(DL_FC0PROTTIMEOUTVAL), 8064); + ufshcd_dme_set(hba, UIC_ARG_MIB(DL_TC0REPLAYTIMEOUTVAL), 28224); + ufshcd_dme_set(hba, UIC_ARG_MIB(DL_AFC0REQTIMEOUTVAL), 20160); return 0; out: diff --git a/drivers/scsi/ufs/ufshcd-crypto.c b/drivers/scsi/ufs/ufshcd-crypto.c index d70cdcd35e43..67402baf6fae 100644 --- a/drivers/scsi/ufs/ufshcd-crypto.c +++ b/drivers/scsi/ufs/ufshcd-crypto.c @@ -48,11 +48,12 @@ out: return err; } -static int ufshcd_crypto_keyslot_program(struct blk_keyslot_manager *ksm, +static int ufshcd_crypto_keyslot_program(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm); + struct ufs_hba *hba = + container_of(profile, struct ufs_hba, crypto_profile); const union ufs_crypto_cap_entry *ccap_array = hba->crypto_cap_array; const struct ufs_crypto_alg_entry *alg = &ufs_crypto_algs[key->crypto_cfg.crypto_mode]; @@ -105,11 +106,12 @@ static int ufshcd_clear_keyslot(struct ufs_hba *hba, int slot) return ufshcd_program_key(hba, &cfg, slot); } -static int ufshcd_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, +static int ufshcd_crypto_keyslot_evict(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot) { - struct ufs_hba *hba = container_of(ksm, struct ufs_hba, ksm); + struct ufs_hba *hba = + container_of(profile, struct ufs_hba, crypto_profile); return ufshcd_clear_keyslot(hba, slot); } @@ -120,11 +122,11 @@ bool ufshcd_crypto_enable(struct ufs_hba *hba) return false; /* Reset might clear all keys, so reprogram all the keys. 
*/ - blk_ksm_reprogram_all_keys(&hba->ksm); + blk_crypto_reprogram_all_keys(&hba->crypto_profile); return true; } -static const struct blk_ksm_ll_ops ufshcd_ksm_ops = { +static const struct blk_crypto_ll_ops ufshcd_crypto_ops = { .keyslot_program = ufshcd_crypto_keyslot_program, .keyslot_evict = ufshcd_crypto_keyslot_evict, }; @@ -179,15 +181,16 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) } /* The actual number of configurations supported is (CFGC+1) */ - err = devm_blk_ksm_init(hba->dev, &hba->ksm, - hba->crypto_capabilities.config_count + 1); + err = devm_blk_crypto_profile_init( + hba->dev, &hba->crypto_profile, + hba->crypto_capabilities.config_count + 1); if (err) goto out; - hba->ksm.ksm_ll_ops = ufshcd_ksm_ops; + hba->crypto_profile.ll_ops = ufshcd_crypto_ops; /* UFS only supports 8 bytes for any DUN */ - hba->ksm.max_dun_bytes_supported = 8; - hba->ksm.dev = hba->dev; + hba->crypto_profile.max_dun_bytes_supported = 8; + hba->crypto_profile.dev = hba->dev; /* * Cache all the UFS crypto capabilities and advertise the supported @@ -202,7 +205,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) blk_mode_num = ufshcd_find_blk_crypto_mode( hba->crypto_cap_array[cap_idx]); if (blk_mode_num != BLK_ENCRYPTION_MODE_INVALID) - hba->ksm.crypto_modes_supported[blk_mode_num] |= + hba->crypto_profile.modes_supported[blk_mode_num] |= hba->crypto_cap_array[cap_idx].sdus_mask * 512; } @@ -230,9 +233,8 @@ void ufshcd_init_crypto(struct ufs_hba *hba) ufshcd_clear_keyslot(hba, slot); } -void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba, - struct request_queue *q) +void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q) { if (hba->caps & UFSHCD_CAP_CRYPTO) - blk_ksm_register(&hba->ksm, q); + blk_crypto_register(&hba->crypto_profile, q); } diff --git a/drivers/scsi/ufs/ufshcd-crypto.h b/drivers/scsi/ufs/ufshcd-crypto.h index 78a58e788dff..e18c01276873 100644 --- a/drivers/scsi/ufs/ufshcd-crypto.h +++ b/drivers/scsi/ufs/ufshcd-crypto.h @@ -18,7 +18,7 @@ static inline void ufshcd_prepare_lrbp_crypto(struct request *rq, return; } - lrbp->crypto_key_slot = blk_ksm_get_slot_idx(rq->crypt_keyslot); + lrbp->crypto_key_slot = blk_crypto_keyslot_index(rq->crypt_keyslot); lrbp->data_unit_num = rq->crypt_ctx->bc_dun[0]; } @@ -40,8 +40,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba); void ufshcd_init_crypto(struct ufs_hba *hba); -void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba, - struct request_queue *q); +void ufshcd_crypto_register(struct ufs_hba *hba, struct request_queue *q); #else /* CONFIG_SCSI_UFS_CRYPTO */ @@ -64,8 +63,8 @@ static inline int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba) static inline void ufshcd_init_crypto(struct ufs_hba *hba) { } -static inline void ufshcd_crypto_setup_rq_keyslot_manager(struct ufs_hba *hba, - struct request_queue *q) { } +static inline void ufshcd_crypto_register(struct ufs_hba *hba, + struct request_queue *q) { } #endif /* CONFIG_SCSI_UFS_CRYPTO */ diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/scsi/ufs/ufshcd-pci.c index 149c1aa09103..51424557810d 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/scsi/ufs/ufshcd-pci.c @@ -370,20 +370,6 @@ static void ufs_intel_common_exit(struct ufs_hba *hba) static int ufs_intel_resume(struct ufs_hba *hba, enum ufs_pm_op op) { - /* - * To support S4 (suspend-to-disk) with spm_lvl other than 5, the base - * address registers must be restored because the restore kernel can - * have used different addresses. 
- */ - ufshcd_writel(hba, lower_32_bits(hba->utrdl_dma_addr), - REG_UTP_TRANSFER_REQ_LIST_BASE_L); - ufshcd_writel(hba, upper_32_bits(hba->utrdl_dma_addr), - REG_UTP_TRANSFER_REQ_LIST_BASE_H); - ufshcd_writel(hba, lower_32_bits(hba->utmrdl_dma_addr), - REG_UTP_TASK_REQ_LIST_BASE_L); - ufshcd_writel(hba, upper_32_bits(hba->utmrdl_dma_addr), - REG_UTP_TASK_REQ_LIST_BASE_H); - if (ufshcd_is_link_hibern8(hba)) { int ret = ufshcd_uic_hibern8_exit(hba); @@ -463,6 +449,18 @@ static struct ufs_hba_variant_ops ufs_intel_lkf_hba_vops = { .device_reset = ufs_intel_device_reset, }; +#ifdef CONFIG_PM_SLEEP +static int ufshcd_pci_restore(struct device *dev) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + /* Force a full reset and restore */ + ufshcd_set_link_off(hba); + + return ufshcd_system_resume(dev); +} +#endif + /** * ufshcd_pci_shutdown - main function to put the controller in reset state * @pdev: pointer to PCI device handle @@ -546,9 +544,14 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) } static const struct dev_pm_ops ufshcd_pci_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(ufshcd_system_suspend, ufshcd_system_resume) SET_RUNTIME_PM_OPS(ufshcd_runtime_suspend, ufshcd_runtime_resume, NULL) #ifdef CONFIG_PM_SLEEP + .suspend = ufshcd_system_suspend, + .resume = ufshcd_system_resume, + .freeze = ufshcd_system_suspend, + .thaw = ufshcd_system_resume, + .poweroff = ufshcd_system_suspend, + .restore = ufshcd_pci_restore, .prepare = ufshcd_suspend_prepare, .complete = ufshcd_resume_complete, #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 188de6f91050..db1bc8655d46 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -2737,12 +2737,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) lrbp->req_abort_skip = false; - err = ufshpb_prep(hba, lrbp); - if (err == -EAGAIN) { - lrbp->cmd = NULL; - ufshcd_release(hba); - goto out; - } + ufshpb_prep(hba, lrbp); ufshcd_comp_scsi_upiu(hba, lrbp); @@ -2925,7 +2920,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba, * Even though we use wait_event() which sleeps indefinitely, * the maximum wait time is bounded by SCSI request timeout. 
*/ - req = blk_get_request(q, REQ_OP_DRV_OUT, 0); + req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; @@ -2952,7 +2947,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba, (struct utp_upiu_req *)lrbp->ucd_rsp_ptr); out: - blk_put_request(req); + blk_mq_free_request(req); out_unlock: up_read(&hba->clk_scaling_lock); return err; @@ -4986,7 +4981,7 @@ static int ufshcd_slave_configure(struct scsi_device *sdev) else if (ufshcd_is_rpm_autosuspend_allowed(hba)) sdev->rpm_autosuspend = 1; - ufshcd_crypto_setup_rq_keyslot_manager(hba, q); + ufshcd_crypto_register(hba, q); return 0; } @@ -6377,27 +6372,6 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status) return retval; } -struct ctm_info { - struct ufs_hba *hba; - unsigned long pending; - unsigned int ncpl; -}; - -static bool ufshcd_compl_tm(struct request *req, void *priv, bool reserved) -{ - struct ctm_info *const ci = priv; - struct completion *c; - - WARN_ON_ONCE(reserved); - if (test_bit(req->tag, &ci->pending)) - return true; - ci->ncpl++; - c = req->end_io_data; - if (c) - complete(c); - return true; -} - /** * ufshcd_tmc_handler - handle task management function completion * @hba: per adapter instance @@ -6408,18 +6382,24 @@ static bool ufshcd_compl_tm(struct request *req, void *priv, bool reserved) */ static irqreturn_t ufshcd_tmc_handler(struct ufs_hba *hba) { - unsigned long flags; - struct request_queue *q = hba->tmf_queue; - struct ctm_info ci = { - .hba = hba, - }; + unsigned long flags, pending, issued; + irqreturn_t ret = IRQ_NONE; + int tag; + + pending = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); spin_lock_irqsave(hba->host->host_lock, flags); - ci.pending = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); - blk_mq_tagset_busy_iter(q->tag_set, ufshcd_compl_tm, &ci); + issued = hba->outstanding_tasks & ~pending; + for_each_set_bit(tag, &issued, hba->nutmrs) { + struct request *req = hba->tmf_rqs[tag]; + struct completion *c = req->end_io_data; + + complete(c); + ret = IRQ_HANDLED; + } spin_unlock_irqrestore(hba->host->host_lock, flags); - return ci.ncpl ? IRQ_HANDLED : IRQ_NONE; + return ret; } /** @@ -6532,9 +6512,9 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, int task_tag, err; /* - * blk_get_request() is used here only to get a free tag. + * blk_mq_alloc_request() is used here only to get a free tag. 
*/ - req = blk_get_request(q, REQ_OP_DRV_OUT, 0); + req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -6542,9 +6522,9 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, ufshcd_hold(hba, false); spin_lock_irqsave(host->host_lock, flags); - blk_mq_start_request(req); task_tag = req->tag; + hba->tmf_rqs[req->tag] = req; treq->upiu_req.req_header.dword_0 |= cpu_to_be32(task_tag); memcpy(hba->utmrdl_base_addr + task_tag, treq, sizeof(*treq)); @@ -6585,11 +6565,12 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, } spin_lock_irqsave(hba->host->host_lock, flags); + hba->tmf_rqs[req->tag] = NULL; __clear_bit(task_tag, &hba->outstanding_tasks); spin_unlock_irqrestore(hba->host->host_lock, flags); ufshcd_release(hba); - blk_put_request(req); + blk_mq_free_request(req); return err; } @@ -6674,7 +6655,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba, down_read(&hba->clk_scaling_lock); - req = blk_get_request(q, REQ_OP_DRV_OUT, 0); + req = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; @@ -6755,7 +6736,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba, (struct utp_upiu_req *)lrbp->ucd_rsp_ptr); out: - blk_put_request(req); + blk_mq_free_request(req); out_unlock: up_read(&hba->clk_scaling_lock); return err; @@ -7926,7 +7907,7 @@ static void ufshcd_request_sense_done(struct request *rq, blk_status_t error) if (error != BLK_STS_OK) pr_err("%s: REQUEST SENSE failed (%d)\n", __func__, error); kfree(rq->end_io_data); - blk_put_request(rq); + blk_mq_free_request(rq); } static int @@ -7946,7 +7927,7 @@ ufshcd_request_sense_async(struct ufs_hba *hba, struct scsi_device *sdev) if (!buffer) return -ENOMEM; - req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN, + req = blk_mq_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, /*flags=*/BLK_MQ_REQ_PM); if (IS_ERR(req)) { ret = PTR_ERR(req); @@ -7971,7 +7952,7 @@ ufshcd_request_sense_async(struct ufs_hba *hba, struct scsi_device *sdev) return 0; out_put: - blk_put_request(req); + blk_mq_free_request(req); out_free: kfree(buffer); return ret; @@ -9635,6 +9616,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) err = PTR_ERR(hba->tmf_queue); goto free_tmf_tag_set; } + hba->tmf_rqs = devm_kcalloc(hba->dev, hba->nutmrs, + sizeof(*hba->tmf_rqs), GFP_KERNEL); + if (!hba->tmf_rqs) { + err = -ENOMEM; + goto free_tmf_queue; + } /* Reset the attached device */ ufshcd_device_reset(hba); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index f0da5d3db1fa..62bdc412d38a 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -32,7 +32,7 @@ #include <linux/regulator/consumer.h> #include <linux/bitfield.h> #include <linux/devfreq.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #include "unipro.h" #include <asm/irq.h> @@ -766,7 +766,7 @@ struct ufs_hba_monitor { * @crypto_capabilities: Content of crypto capabilities register (0x100) * @crypto_cap_array: Array of crypto capabilities * @crypto_cfg_register: Start of the crypto cfg array - * @ksm: the keyslot manager tied to this hba + * @crypto_profile: the crypto profile of this hba (if applicable) */ struct ufs_hba { void __iomem *mmio_base; @@ -828,6 +828,7 @@ struct ufs_hba { struct blk_mq_tag_set tmf_tag_set; struct request_queue *tmf_queue; + struct request **tmf_rqs; struct uic_command *active_uic_cmd; struct mutex uic_cmd_mutex; @@ -910,7 +911,7 @@ struct ufs_hba { union 
ufs_crypto_capabilities crypto_capabilities; union ufs_crypto_cap_entry *crypto_cap_array; u32 crypto_cfg_register; - struct blk_keyslot_manager ksm; + struct blk_crypto_profile crypto_profile; #endif #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_root; diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index 589af5f6b940..182bcbf60f8c 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -84,16 +84,6 @@ static bool ufshpb_is_supported_chunk(struct ufshpb_lu *hpb, int transfer_len) return transfer_len <= hpb->pre_req_max_tr_len; } -/* - * In this driver, WRITE_BUFFER CMD support 36KB (len=9) ~ 1MB (len=256) as - * default. It is possible to change range of transfer_len through sysfs. - */ -static inline bool ufshpb_is_required_wb(struct ufshpb_lu *hpb, int len) -{ - return len > hpb->pre_req_min_tr_len && - len <= hpb->pre_req_max_tr_len; -} - static bool ufshpb_is_general_lun(int lun) { return lun < UFS_UPIU_MAX_UNIT_NUM_ID; @@ -334,7 +324,7 @@ ufshpb_get_pos_from_lpn(struct ufshpb_lu *hpb, unsigned long lpn, int *rgn_idx, static void ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, - __be64 ppn, u8 transfer_len, int read_id) + __be64 ppn, u8 transfer_len) { unsigned char *cdb = lrbp->cmd->cmnd; __be64 ppn_tmp = ppn; @@ -346,256 +336,11 @@ ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, /* ppn value is stored as big-endian in the host memory */ memcpy(&cdb[6], &ppn_tmp, sizeof(__be64)); cdb[14] = transfer_len; - cdb[15] = read_id; + cdb[15] = 0; lrbp->cmd->cmd_len = UFS_CDB_SIZE; } -static inline void ufshpb_set_write_buf_cmd(unsigned char *cdb, - unsigned long lpn, unsigned int len, - int read_id) -{ - cdb[0] = UFSHPB_WRITE_BUFFER; - cdb[1] = UFSHPB_WRITE_BUFFER_PREFETCH_ID; - - put_unaligned_be32(lpn, &cdb[2]); - cdb[6] = read_id; - put_unaligned_be16(len * HPB_ENTRY_SIZE, &cdb[7]); - - cdb[9] = 0x00; /* Control = 0x00 */ -} - -static struct ufshpb_req *ufshpb_get_pre_req(struct ufshpb_lu *hpb) -{ - struct ufshpb_req *pre_req; - - if (hpb->num_inflight_pre_req >= hpb->throttle_pre_req) { - dev_info(&hpb->sdev_ufs_lu->sdev_dev, - "pre_req throttle. 
inflight %d throttle %d", - hpb->num_inflight_pre_req, hpb->throttle_pre_req); - return NULL; - } - - pre_req = list_first_entry_or_null(&hpb->lh_pre_req_free, - struct ufshpb_req, list_req); - if (!pre_req) { - dev_info(&hpb->sdev_ufs_lu->sdev_dev, "There is no pre_req"); - return NULL; - } - - list_del_init(&pre_req->list_req); - hpb->num_inflight_pre_req++; - - return pre_req; -} - -static inline void ufshpb_put_pre_req(struct ufshpb_lu *hpb, - struct ufshpb_req *pre_req) -{ - pre_req->req = NULL; - bio_reset(pre_req->bio); - list_add_tail(&pre_req->list_req, &hpb->lh_pre_req_free); - hpb->num_inflight_pre_req--; -} - -static void ufshpb_pre_req_compl_fn(struct request *req, blk_status_t error) -{ - struct ufshpb_req *pre_req = (struct ufshpb_req *)req->end_io_data; - struct ufshpb_lu *hpb = pre_req->hpb; - unsigned long flags; - - if (error) { - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - struct scsi_sense_hdr sshdr; - - dev_err(&hpb->sdev_ufs_lu->sdev_dev, "block status %d", error); - scsi_command_normalize_sense(cmd, &sshdr); - dev_err(&hpb->sdev_ufs_lu->sdev_dev, - "code %x sense_key %x asc %x ascq %x", - sshdr.response_code, - sshdr.sense_key, sshdr.asc, sshdr.ascq); - dev_err(&hpb->sdev_ufs_lu->sdev_dev, - "byte4 %x byte5 %x byte6 %x additional_len %x", - sshdr.byte4, sshdr.byte5, - sshdr.byte6, sshdr.additional_length); - } - - blk_mq_free_request(req); - spin_lock_irqsave(&hpb->rgn_state_lock, flags); - ufshpb_put_pre_req(pre_req->hpb, pre_req); - spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); -} - -static int ufshpb_prep_entry(struct ufshpb_req *pre_req, struct page *page) -{ - struct ufshpb_lu *hpb = pre_req->hpb; - struct ufshpb_region *rgn; - struct ufshpb_subregion *srgn; - __be64 *addr; - int offset = 0; - int copied; - unsigned long lpn = pre_req->wb.lpn; - int rgn_idx, srgn_idx, srgn_offset; - unsigned long flags; - - addr = page_address(page); - ufshpb_get_pos_from_lpn(hpb, lpn, &rgn_idx, &srgn_idx, &srgn_offset); - - spin_lock_irqsave(&hpb->rgn_state_lock, flags); - -next_offset: - rgn = hpb->rgn_tbl + rgn_idx; - srgn = rgn->srgn_tbl + srgn_idx; - - if (!ufshpb_is_valid_srgn(rgn, srgn)) - goto mctx_error; - - if (!srgn->mctx) - goto mctx_error; - - copied = ufshpb_fill_ppn_from_page(hpb, srgn->mctx, srgn_offset, - pre_req->wb.len - offset, - &addr[offset]); - - if (copied < 0) - goto mctx_error; - - offset += copied; - srgn_offset += copied; - - if (srgn_offset == hpb->entries_per_srgn) { - srgn_offset = 0; - - if (++srgn_idx == hpb->srgns_per_rgn) { - srgn_idx = 0; - rgn_idx++; - } - } - - if (offset < pre_req->wb.len) - goto next_offset; - - spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); - return 0; -mctx_error: - spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); - return -ENOMEM; -} - -static int ufshpb_pre_req_add_bio_page(struct ufshpb_lu *hpb, - struct request_queue *q, - struct ufshpb_req *pre_req) -{ - struct page *page = pre_req->wb.m_page; - struct bio *bio = pre_req->bio; - int entries_bytes, ret; - - if (!page) - return -ENOMEM; - - if (ufshpb_prep_entry(pre_req, page)) - return -ENOMEM; - - entries_bytes = pre_req->wb.len * sizeof(__be64); - - ret = bio_add_pc_page(q, bio, page, entries_bytes, 0); - if (ret != entries_bytes) { - dev_err(&hpb->sdev_ufs_lu->sdev_dev, - "bio_add_pc_page fail: %d", ret); - return -ENOMEM; - } - return 0; -} - -static inline int ufshpb_get_read_id(struct ufshpb_lu *hpb) -{ - if (++hpb->cur_read_id >= MAX_HPB_READ_ID) - hpb->cur_read_id = 1; - return hpb->cur_read_id; -} - -static int 
ufshpb_execute_pre_req(struct ufshpb_lu *hpb, struct scsi_cmnd *cmd, - struct ufshpb_req *pre_req, int read_id) -{ - struct scsi_device *sdev = cmd->device; - struct request_queue *q = sdev->request_queue; - struct request *req; - struct scsi_request *rq; - struct bio *bio = pre_req->bio; - - pre_req->hpb = hpb; - pre_req->wb.lpn = sectors_to_logical(cmd->device, - blk_rq_pos(scsi_cmd_to_rq(cmd))); - pre_req->wb.len = sectors_to_logical(cmd->device, - blk_rq_sectors(scsi_cmd_to_rq(cmd))); - if (ufshpb_pre_req_add_bio_page(hpb, q, pre_req)) - return -ENOMEM; - - req = pre_req->req; - - /* 1. request setup */ - blk_rq_append_bio(req, bio); - req->rq_disk = NULL; - req->end_io_data = (void *)pre_req; - req->end_io = ufshpb_pre_req_compl_fn; - - /* 2. scsi_request setup */ - rq = scsi_req(req); - rq->retries = 1; - - ufshpb_set_write_buf_cmd(rq->cmd, pre_req->wb.lpn, pre_req->wb.len, - read_id); - rq->cmd_len = scsi_command_size(rq->cmd); - - if (blk_insert_cloned_request(q, req) != BLK_STS_OK) - return -EAGAIN; - - hpb->stats.pre_req_cnt++; - - return 0; -} - -static int ufshpb_issue_pre_req(struct ufshpb_lu *hpb, struct scsi_cmnd *cmd, - int *read_id) -{ - struct ufshpb_req *pre_req; - struct request *req = NULL; - unsigned long flags; - int _read_id; - int ret = 0; - - req = blk_get_request(cmd->device->request_queue, - REQ_OP_DRV_OUT | REQ_SYNC, BLK_MQ_REQ_NOWAIT); - if (IS_ERR(req)) - return -EAGAIN; - - spin_lock_irqsave(&hpb->rgn_state_lock, flags); - pre_req = ufshpb_get_pre_req(hpb); - if (!pre_req) { - ret = -EAGAIN; - goto unlock_out; - } - _read_id = ufshpb_get_read_id(hpb); - spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); - - pre_req->req = req; - - ret = ufshpb_execute_pre_req(hpb, cmd, pre_req, _read_id); - if (ret) - goto free_pre_req; - - *read_id = _read_id; - - return ret; -free_pre_req: - spin_lock_irqsave(&hpb->rgn_state_lock, flags); - ufshpb_put_pre_req(hpb, pre_req); -unlock_out: - spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); - blk_put_request(req); - return ret; -} - /* * This function will set up HPB read command using host-side L2P map data. */ @@ -609,7 +354,6 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) __be64 ppn; unsigned long flags; int transfer_len, rgn_idx, srgn_idx, srgn_offset; - int read_id = 0; int err = 0; hpb = ufshpb_get_hpb_data(cmd->device); @@ -685,24 +429,8 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) dev_err(hba->dev, "get ppn failed. 
err %d\n", err); return err; } - if (!ufshpb_is_legacy(hba) && - ufshpb_is_required_wb(hpb, transfer_len)) { - err = ufshpb_issue_pre_req(hpb, cmd, &read_id); - if (err) { - unsigned long timeout; - - timeout = cmd->jiffies_at_alloc + msecs_to_jiffies( - hpb->params.requeue_timeout_ms); - - if (time_before(jiffies, timeout)) - return -EAGAIN; - - hpb->stats.miss_cnt++; - return 0; - } - } - ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len, read_id); + ufshpb_set_hpb_read_to_upiu(hba, lrbp, ppn, transfer_len); hpb->stats.hit_cnt++; return 0; @@ -721,7 +449,7 @@ static struct ufshpb_req *ufshpb_get_req(struct ufshpb_lu *hpb, return NULL; retry: - req = blk_get_request(hpb->sdev_ufs_lu->request_queue, dir, + req = blk_mq_alloc_request(hpb->sdev_ufs_lu->request_queue, dir, BLK_MQ_REQ_NOWAIT); if (!atomic && (PTR_ERR(req) == -EWOULDBLOCK) && (--retries > 0)) { @@ -745,7 +473,7 @@ free_rq: static void ufshpb_put_req(struct ufshpb_lu *hpb, struct ufshpb_req *rq) { - blk_put_request(rq->req); + blk_mq_free_request(rq->req); kmem_cache_free(hpb->map_req_cache, rq); } @@ -1841,16 +1569,11 @@ static void ufshpb_lu_parameter_init(struct ufs_hba *hba, u32 entries_per_rgn; u64 rgn_mem_size, tmp; - /* for pre_req */ - hpb->pre_req_min_tr_len = hpb_dev_info->max_hpb_single_cmd + 1; - if (ufshpb_is_legacy(hba)) hpb->pre_req_max_tr_len = HPB_LEGACY_CHUNK_HIGH; else hpb->pre_req_max_tr_len = HPB_MULTI_CHUNK_HIGH; - hpb->cur_read_id = 0; - hpb->lu_pinned_start = hpb_lu_info->pinned_start; hpb->lu_pinned_end = hpb_lu_info->num_pinned ? (hpb_lu_info->pinned_start + hpb_lu_info->num_pinned - 1) diff --git a/drivers/scsi/ufs/ufshpb.h b/drivers/scsi/ufs/ufshpb.h index a79e07398970..f15d8fdbce2e 100644 --- a/drivers/scsi/ufs/ufshpb.h +++ b/drivers/scsi/ufs/ufshpb.h @@ -241,8 +241,6 @@ struct ufshpb_lu { spinlock_t param_lock; struct list_head lh_pre_req_free; - int cur_read_id; - int pre_req_min_tr_len; int pre_req_max_tr_len; /* cached L2P map management worker */ diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 07d0250f17c3..b8455fcbf18b 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -22,6 +22,7 @@ #include <linux/virtio_scsi.h> #include <linux/cpu.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <scsi/scsi_host.h> #include <scsi/scsi_device.h> #include <scsi/scsi_cmnd.h> diff --git a/drivers/soc/canaan/Kconfig b/drivers/soc/canaan/Kconfig index 8179b69518b4..853096b7e84c 100644 --- a/drivers/soc/canaan/Kconfig +++ b/drivers/soc/canaan/Kconfig @@ -5,7 +5,6 @@ config SOC_K210_SYSCTL depends on RISCV && SOC_CANAAN && OF default SOC_CANAAN select PM - select SIMPLE_PM_BUS select SYSCON select MFD_SYSCON help diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c index bda170d7b4a2..72fc2b539213 100644 --- a/drivers/soc/qcom/mdt_loader.c +++ b/drivers/soc/qcom/mdt_loader.c @@ -98,7 +98,7 @@ void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len) if (ehdr->e_phnum < 2) return ERR_PTR(-EINVAL); - if (phdrs[0].p_type == PT_LOAD || phdrs[1].p_type == PT_LOAD) + if (phdrs[0].p_type == PT_LOAD) return ERR_PTR(-EINVAL); if ((phdrs[1].p_flags & QCOM_MDT_TYPE_MASK) != QCOM_MDT_TYPE_HASH) diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index 9faf48302f4b..52e581167115 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -628,7 +628,7 @@ static int qcom_socinfo_probe(struct platform_device *pdev) /* Feed the soc specific unique data into entropy pool */ 
add_device_randomness(info, item_size); - platform_set_drvdata(pdev, qs->soc_dev); + platform_set_drvdata(pdev, qs); return 0; } diff --git a/drivers/soc/ti/omap_prm.c b/drivers/soc/ti/omap_prm.c index ea64e187854e..f32e1cbbe8c5 100644 --- a/drivers/soc/ti/omap_prm.c +++ b/drivers/soc/ti/omap_prm.c @@ -825,25 +825,28 @@ static int omap_reset_deassert(struct reset_controller_dev *rcdev, writel_relaxed(v, reset->prm->base + reset->prm->data->rstctrl); spin_unlock_irqrestore(&reset->lock, flags); - if (!has_rstst) - goto exit; + /* wait for the reset bit to clear */ + ret = readl_relaxed_poll_timeout_atomic(reset->prm->base + + reset->prm->data->rstctrl, + v, !(v & BIT(id)), 1, + OMAP_RESET_MAX_WAIT); + if (ret) + pr_err("%s: timedout waiting for %s:%lu\n", __func__, + reset->prm->data->name, id); /* wait for the status to be set */ - ret = readl_relaxed_poll_timeout_atomic(reset->prm->base + + if (has_rstst) { + ret = readl_relaxed_poll_timeout_atomic(reset->prm->base + reset->prm->data->rstst, v, v & BIT(st_bit), 1, OMAP_RESET_MAX_WAIT); - if (ret) - pr_err("%s: timedout waiting for %s:%lu\n", __func__, - reset->prm->data->name, id); + if (ret) + pr_err("%s: timedout waiting for %s:%lu\n", __func__, + reset->prm->data->name, id); + } -exit: - if (reset->clkdm) { - /* At least dra7 iva needs a delay before clkdm idle */ - if (has_rstst) - udelay(1); + if (reset->clkdm) pdata->clkdm_allow_idle(reset->clkdm); - } return ret; } diff --git a/drivers/spi/spi-altera-dfl.c b/drivers/spi/spi-altera-dfl.c index 44fc9ee13fc7..ca40923258af 100644 --- a/drivers/spi/spi-altera-dfl.c +++ b/drivers/spi/spi-altera-dfl.c @@ -134,7 +134,7 @@ static int dfl_spi_altera_probe(struct dfl_device *dfl_dev) if (!master) return -ENOMEM; - master->bus_num = dfl_dev->id; + master->bus_num = -1; hw = spi_master_get_devdata(master); diff --git a/drivers/spi/spi-altera-platform.c b/drivers/spi/spi-altera-platform.c index f7a7c14e3679..65147aae82a1 100644 --- a/drivers/spi/spi-altera-platform.c +++ b/drivers/spi/spi-altera-platform.c @@ -48,7 +48,7 @@ static int altera_spi_probe(struct platform_device *pdev) return err; /* setup the master state. */ - master->bus_num = pdev->id; + master->bus_num = -1; if (pdata) { if (pdata->num_chipselect > ALTERA_SPI_MAX_CS) { diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c index 788dcdf25f00..f872cf196c2f 100644 --- a/drivers/spi/spi-atmel.c +++ b/drivers/spi/spi-atmel.c @@ -1301,7 +1301,7 @@ static int atmel_spi_one_transfer(struct spi_master *master, * DMA map early, for performance (empties dcache ASAP) and * better fault reporting. 
*/ - if ((!master->cur_msg_mapped) + if ((!master->cur_msg->is_dma_mapped) && as->use_pdc) { if (atmel_spi_dma_map_xfer(as, xfer) < 0) return -ENOMEM; @@ -1381,7 +1381,7 @@ static int atmel_spi_one_transfer(struct spi_master *master, } } - if (!master->cur_msg_mapped + if (!master->cur_msg->is_dma_mapped && as->use_pdc) atmel_spi_dma_unmap_xfer(master, xfer); diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index a78e56f566dd..3043677ba222 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -1250,10 +1250,14 @@ static void bcm_qspi_hw_init(struct bcm_qspi *qspi) static void bcm_qspi_hw_uninit(struct bcm_qspi *qspi) { + u32 status = bcm_qspi_read(qspi, MSPI, MSPI_MSPI_STATUS); + bcm_qspi_write(qspi, MSPI, MSPI_SPCR2, 0); if (has_bspi(qspi)) bcm_qspi_write(qspi, MSPI, MSPI_WRITE_LOCK, 0); + /* clear interrupt */ + bcm_qspi_write(qspi, MSPI, MSPI_MSPI_STATUS, status & ~1); } static const struct spi_controller_mem_ops bcm_qspi_mem_ops = { @@ -1397,6 +1401,47 @@ int bcm_qspi_probe(struct platform_device *pdev, if (!qspi->dev_ids) return -ENOMEM; + /* + * Some SoCs integrate spi controller (e.g., its interrupt bits) + * in specific ways + */ + if (soc_intc) { + qspi->soc_intc = soc_intc; + soc_intc->bcm_qspi_int_set(soc_intc, MSPI_DONE, true); + } else { + qspi->soc_intc = NULL; + } + + if (qspi->clk) { + ret = clk_prepare_enable(qspi->clk); + if (ret) { + dev_err(dev, "failed to prepare clock\n"); + goto qspi_probe_err; + } + qspi->base_clk = clk_get_rate(qspi->clk); + } else { + qspi->base_clk = MSPI_BASE_FREQ; + } + + if (data->has_mspi_rev) { + rev = bcm_qspi_read(qspi, MSPI, MSPI_REV); + /* some older revs do not have a MSPI_REV register */ + if ((rev & 0xff) == 0xff) + rev = 0; + } + + qspi->mspi_maj_rev = (rev >> 4) & 0xf; + qspi->mspi_min_rev = rev & 0xf; + qspi->mspi_spcr3_sysclk = data->has_spcr3_sysclk; + + qspi->max_speed_hz = qspi->base_clk / (bcm_qspi_spbr_min(qspi) * 2); + + /* + * On SW resets it is possible to have the mask still enabled + * Need to disable the mask and clear the status while we init + */ + bcm_qspi_hw_uninit(qspi); + for (val = 0; val < num_irqs; val++) { irq = -1; name = qspi_irq_tab[val].irq_name; @@ -1433,38 +1478,6 @@ int bcm_qspi_probe(struct platform_device *pdev, goto qspi_probe_err; } - /* - * Some SoCs integrate spi controller (e.g., its interrupt bits) - * in specific ways - */ - if (soc_intc) { - qspi->soc_intc = soc_intc; - soc_intc->bcm_qspi_int_set(soc_intc, MSPI_DONE, true); - } else { - qspi->soc_intc = NULL; - } - - ret = clk_prepare_enable(qspi->clk); - if (ret) { - dev_err(dev, "failed to prepare clock\n"); - goto qspi_probe_err; - } - - qspi->base_clk = clk_get_rate(qspi->clk); - - if (data->has_mspi_rev) { - rev = bcm_qspi_read(qspi, MSPI, MSPI_REV); - /* some older revs do not have a MSPI_REV register */ - if ((rev & 0xff) == 0xff) - rev = 0; - } - - qspi->mspi_maj_rev = (rev >> 4) & 0xf; - qspi->mspi_min_rev = rev & 0xf; - qspi->mspi_spcr3_sysclk = data->has_spcr3_sysclk; - - qspi->max_speed_hz = qspi->base_clk / (bcm_qspi_spbr_min(qspi) * 2); - bcm_qspi_hw_init(qspi); init_completion(&qspi->mspi_done); init_completion(&qspi->bspi_done); diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index 386e8c84be0a..a15de10ee286 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -233,36 +233,44 @@ static int mtk_spi_set_hw_cs_timing(struct spi_device *spi) return delay; inactive = (delay * DIV_ROUND_UP(mdata->spi_clk_hz, 1000000)) / 1000; - setup = setup ? 
setup : 1; - hold = hold ? hold : 1; - inactive = inactive ? inactive : 1; - - reg_val = readl(mdata->base + SPI_CFG0_REG); - if (mdata->dev_comp->enhance_timing) { - hold = min_t(u32, hold, 0x10000); - setup = min_t(u32, setup, 0x10000); - reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); - reg_val |= (((hold - 1) & 0xffff) - << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); - reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); - reg_val |= (((setup - 1) & 0xffff) - << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); - } else { - hold = min_t(u32, hold, 0x100); - setup = min_t(u32, setup, 0x100); - reg_val &= ~(0xff << SPI_CFG0_CS_HOLD_OFFSET); - reg_val |= (((hold - 1) & 0xff) << SPI_CFG0_CS_HOLD_OFFSET); - reg_val &= ~(0xff << SPI_CFG0_CS_SETUP_OFFSET); - reg_val |= (((setup - 1) & 0xff) - << SPI_CFG0_CS_SETUP_OFFSET); + if (hold || setup) { + reg_val = readl(mdata->base + SPI_CFG0_REG); + if (mdata->dev_comp->enhance_timing) { + if (hold) { + hold = min_t(u32, hold, 0x10000); + reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); + reg_val |= (((hold - 1) & 0xffff) + << SPI_ADJUST_CFG0_CS_HOLD_OFFSET); + } + if (setup) { + setup = min_t(u32, setup, 0x10000); + reg_val &= ~(0xffff << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); + reg_val |= (((setup - 1) & 0xffff) + << SPI_ADJUST_CFG0_CS_SETUP_OFFSET); + } + } else { + if (hold) { + hold = min_t(u32, hold, 0x100); + reg_val &= ~(0xff << SPI_CFG0_CS_HOLD_OFFSET); + reg_val |= (((hold - 1) & 0xff) << SPI_CFG0_CS_HOLD_OFFSET); + } + if (setup) { + setup = min_t(u32, setup, 0x100); + reg_val &= ~(0xff << SPI_CFG0_CS_SETUP_OFFSET); + reg_val |= (((setup - 1) & 0xff) + << SPI_CFG0_CS_SETUP_OFFSET); + } + } + writel(reg_val, mdata->base + SPI_CFG0_REG); } - writel(reg_val, mdata->base + SPI_CFG0_REG); - inactive = min_t(u32, inactive, 0x100); - reg_val = readl(mdata->base + SPI_CFG1_REG); - reg_val &= ~SPI_CFG1_CS_IDLE_MASK; - reg_val |= (((inactive - 1) & 0xff) << SPI_CFG1_CS_IDLE_OFFSET); - writel(reg_val, mdata->base + SPI_CFG1_REG); + if (inactive) { + inactive = min_t(u32, inactive, 0x100); + reg_val = readl(mdata->base + SPI_CFG1_REG); + reg_val &= ~SPI_CFG1_CS_IDLE_MASK; + reg_val |= (((inactive - 1) & 0xff) << SPI_CFG1_CS_IDLE_OFFSET); + writel(reg_val, mdata->base + SPI_CFG1_REG); + } return 0; } diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c index 9708b7827ff7..f5d32ec4634e 100644 --- a/drivers/spi/spi-mux.c +++ b/drivers/spi/spi-mux.c @@ -137,6 +137,13 @@ static int spi_mux_probe(struct spi_device *spi) priv = spi_controller_get_devdata(ctlr); priv->spi = spi; + /* + * Increase lockdep class as these lock are taken while the parent bus + * already holds their instance's lock. 
+ */ + lockdep_set_subclass(&ctlr->io_mutex, 1); + lockdep_set_subclass(&ctlr->add_lock, 1); + priv->mux = devm_mux_control_get(&spi->dev, NULL); if (IS_ERR(priv->mux)) { ret = dev_err_probe(&spi->dev, PTR_ERR(priv->mux), diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index a66fa97046ee..2b0301fc971c 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -33,6 +33,7 @@ #include <linux/acpi.h> #include <linux/bitops.h> +#include <linux/bitfield.h> #include <linux/clk.h> #include <linux/completion.h> #include <linux/delay.h> @@ -315,6 +316,7 @@ #define NXP_FSPI_MIN_IOMAP SZ_4M #define DCFG_RCWSR1 0x100 +#define SYS_PLL_RAT GENMASK(6, 2) /* Access flash memory using IP bus only */ #define FSPI_QUIRK_USE_IP_ONLY BIT(0) @@ -926,9 +928,8 @@ static void erratum_err050568(struct nxp_fspi *f) { .family = "QorIQ LS1028A" }, { /* sentinel */ } }; - struct device_node *np; struct regmap *map; - u32 val = 0, sysclk = 0; + u32 val, sys_pll_ratio; int ret; /* Check for LS1028A family */ @@ -937,7 +938,6 @@ static void erratum_err050568(struct nxp_fspi *f) return; } - /* Compute system clock frequency multiplier ratio */ map = syscon_regmap_lookup_by_compatible("fsl,ls1028a-dcfg"); if (IS_ERR(map)) { dev_err(f->dev, "No syscon regmap\n"); @@ -948,23 +948,11 @@ static void erratum_err050568(struct nxp_fspi *f) if (ret < 0) goto err; - /* Strap bits 6:2 define SYS_PLL_RAT i.e frequency multiplier ratio */ - val = (val >> 2) & 0x1F; - WARN(val == 0, "Strapping is zero: Cannot determine ratio"); + sys_pll_ratio = FIELD_GET(SYS_PLL_RAT, val); + dev_dbg(f->dev, "val: 0x%08x, sys_pll_ratio: %d\n", val, sys_pll_ratio); - /* Compute system clock frequency */ - np = of_find_node_by_name(NULL, "clock-sysclk"); - if (!np) - goto err; - - if (of_property_read_u32(np, "clock-frequency", &sysclk)) - goto err; - - sysclk = (sysclk * val) / 1000000; /* Convert sysclk to Mhz */ - dev_dbg(f->dev, "val: 0x%08x, sysclk: %dMhz\n", val, sysclk); - - /* Use IP bus only if PLL is 300MHz */ - if (sysclk == 300) + /* Use IP bus only if platform clock is 300MHz */ + if (sys_pll_ratio == 3) f->devtype_data->quirks |= FSPI_QUIRK_USE_IP_ONLY; return; diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index feebda66f56e..e4484ace584e 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -1716,12 +1716,13 @@ static int verify_controller_parameters(struct pl022 *pl022, return -EINVAL; } } else { - if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) + if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) { dev_err(&pl022->adev->dev, "Microwire half duplex mode requested," " but this is only available in the" " ST version of PL022\n"); - return -EINVAL; + return -EINVAL; + } } } return 0; diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c index 8ce840c7ecc8..3226c4e1c7c0 100644 --- a/drivers/spi/spi-tegra20-slink.c +++ b/drivers/spi/spi-tegra20-slink.c @@ -1182,8 +1182,7 @@ static int tegra_slink_resume(struct device *dev) } #endif -#ifdef CONFIG_PM -static int tegra_slink_runtime_suspend(struct device *dev) +static int __maybe_unused tegra_slink_runtime_suspend(struct device *dev) { struct spi_master *master = dev_get_drvdata(dev); struct tegra_slink_data *tspi = spi_master_get_devdata(master); @@ -1195,7 +1194,7 @@ static int tegra_slink_runtime_suspend(struct device *dev) return 0; } -static int tegra_slink_runtime_resume(struct device *dev) +static int __maybe_unused tegra_slink_runtime_resume(struct device *dev) { struct 
spi_master *master = dev_get_drvdata(dev); struct tegra_slink_data *tspi = spi_master_get_devdata(master); @@ -1208,7 +1207,6 @@ static int tegra_slink_runtime_resume(struct device *dev) } return 0; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops slink_pm_ops = { SET_RUNTIME_PM_OPS(tegra_slink_runtime_suspend, diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index aea037c65985..926b68aa45d3 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -478,12 +478,6 @@ static LIST_HEAD(spi_controller_list); */ static DEFINE_MUTEX(board_lock); -/* - * Prevents addition of devices with same chip select and - * addition of devices below an unregistering controller. - */ -static DEFINE_MUTEX(spi_add_lock); - /** * spi_alloc_device - Allocate a new SPI device * @ctlr: Controller to which device is connected @@ -636,9 +630,9 @@ int spi_add_device(struct spi_device *spi) * chipselect **BEFORE** we call setup(), else we'll trash * its configuration. Lock against concurrent add() calls. */ - mutex_lock(&spi_add_lock); + mutex_lock(&ctlr->add_lock); status = __spi_add_device(spi); - mutex_unlock(&spi_add_lock); + mutex_unlock(&ctlr->add_lock); return status; } EXPORT_SYMBOL_GPL(spi_add_device); @@ -658,7 +652,7 @@ static int spi_add_device_locked(struct spi_device *spi) /* Set the bus ID string */ spi_dev_set_name(spi); - WARN_ON(!mutex_is_locked(&spi_add_lock)); + WARN_ON(!mutex_is_locked(&ctlr->add_lock)); return __spi_add_device(spi); } @@ -2553,6 +2547,12 @@ struct spi_controller *__spi_alloc_controller(struct device *dev, return NULL; device_initialize(&ctlr->dev); + INIT_LIST_HEAD(&ctlr->queue); + spin_lock_init(&ctlr->queue_lock); + spin_lock_init(&ctlr->bus_lock_spinlock); + mutex_init(&ctlr->bus_lock_mutex); + mutex_init(&ctlr->io_mutex); + mutex_init(&ctlr->add_lock); ctlr->bus_num = -1; ctlr->num_chipselect = 1; ctlr->slave = slave; @@ -2825,11 +2825,6 @@ int spi_register_controller(struct spi_controller *ctlr) return id; ctlr->bus_num = id; } - INIT_LIST_HEAD(&ctlr->queue); - spin_lock_init(&ctlr->queue_lock); - spin_lock_init(&ctlr->bus_lock_spinlock); - mutex_init(&ctlr->bus_lock_mutex); - mutex_init(&ctlr->io_mutex); ctlr->bus_lock_flag = 0; init_completion(&ctlr->xfer_completion); if (!ctlr->max_dma_len) @@ -2966,7 +2961,7 @@ void spi_unregister_controller(struct spi_controller *ctlr) /* Prevent addition of new devices, unregister existing ones */ if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) - mutex_lock(&spi_add_lock); + mutex_lock(&ctlr->add_lock); device_for_each_child(&ctlr->dev, NULL, __unregister); @@ -2997,7 +2992,7 @@ void spi_unregister_controller(struct spi_controller *ctlr) mutex_unlock(&board_lock); if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) - mutex_unlock(&spi_add_lock); + mutex_unlock(&ctlr->add_lock); } EXPORT_SYMBOL_GPL(spi_unregister_controller); diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 6dc29ce3b4bf..1bd73e322b7b 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -673,6 +673,19 @@ static const struct file_operations spidev_fops = { static struct class *spidev_class; +static const struct spi_device_id spidev_spi_ids[] = { + { .name = "dh2228fv" }, + { .name = "ltc2488" }, + { .name = "sx1301" }, + { .name = "bk4" }, + { .name = "dhcom-board" }, + { .name = "m53cpld" }, + { .name = "spi-petra" }, + { .name = "spi-authenta" }, + {}, +}; +MODULE_DEVICE_TABLE(spi, spidev_spi_ids); + #ifdef CONFIG_OF static const struct of_device_id spidev_dt_ids[] = { { .compatible = "rohm,dh2228fv" }, @@ -818,6 +831,7 @@ static struct spi_driver 
spidev_spi_driver = { }, .probe = spidev_probe, .remove = spidev_remove, + .id_table = spidev_spi_ids, /* NOTE: suspend/resume methods are not necessary here. * We don't do anything except pass the requests to/from diff --git a/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c b/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c index 8e085dda0c18..712e01c37870 100644 --- a/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c +++ b/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/input_system.c @@ -1646,6 +1646,8 @@ static input_system_err_t input_system_configure_channel_sensor( default: return INPUT_SYSTEM_ERR_PARAMETER_NOT_SUPPORTED; } + + return INPUT_SYSTEM_ERR_NO_ERROR; } // Test flags and set structure. diff --git a/drivers/staging/r8188eu/hal/hal_intf.c b/drivers/staging/r8188eu/hal/hal_intf.c index a6d589e89aeb..f27eba72d646 100644 --- a/drivers/staging/r8188eu/hal/hal_intf.c +++ b/drivers/staging/r8188eu/hal/hal_intf.c @@ -248,7 +248,7 @@ void rtw_hal_update_ra_mask(struct adapter *adapt, u32 mac_id, u8 rssi_level) #ifdef CONFIG_88EU_AP_MODE struct sta_info *psta = NULL; struct sta_priv *pstapriv = &adapt->stapriv; - if ((mac_id - 1) > 0) + if (mac_id >= 2) psta = pstapriv->sta_aid[(mac_id - 1) - 1]; if (psta) add_RATid(adapt, psta, 0);/* todo: based on rssi_level*/ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index b25369a13452..967f10b9582a 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -182,7 +182,7 @@ create_pagelist(char *buf, char __user *ubuf, offset = (uintptr_t)ubuf & (PAGE_SIZE - 1); num_pages = DIV_ROUND_UP(count + offset, PAGE_SIZE); - if (num_pages > (SIZE_MAX - sizeof(struct pagelist) - + if ((size_t)num_pages > (SIZE_MAX - sizeof(struct pagelist) - sizeof(struct vchiq_pagelist_info)) / (sizeof(u32) + sizeof(pages[0]) + sizeof(struct scatterlist))) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index ef4a8e189fba..8190b840065f 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -20,6 +20,7 @@ #include <linux/vmalloc.h> #include <linux/falloc.h> #include <linux/uio.h> +#include <linux/scatterlist.h> #include <scsi/scsi_proto.h> #include <asm/unaligned.h> @@ -244,7 +245,7 @@ struct target_core_file_cmd { struct bio_vec bvecs[]; }; -static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) +static void cmd_rw_aio_complete(struct kiocb *iocb, long ret) { struct target_core_file_cmd *cmd; @@ -302,7 +303,7 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, ret = call_read_iter(file, &aio_cmd->iocb, &iter); if (ret != -EIOCBQUEUED) - cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0); + cmd_rw_aio_complete(&aio_cmd->iocb, ret); return 0; } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 4069a1edcfa3..b1ef041cacd8 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -16,12 +16,14 @@ #include <linux/timer.h> #include <linux/fs.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/bio.h> #include <linux/genhd.h> #include <linux/file.h> #include <linux/module.h> +#include <linux/scatterlist.h> #include <scsi/scsi_proto.h> #include 
<asm/unaligned.h> @@ -230,9 +232,9 @@ static unsigned long long iblock_emulate_read_cap_with_block_size( struct block_device *bd, struct request_queue *q) { - unsigned long long blocks_long = (div_u64(i_size_read(bd->bd_inode), - bdev_logical_block_size(bd)) - 1); u32 block_size = bdev_logical_block_size(bd); + unsigned long long blocks_long = + div_u64(bdev_nr_bytes(bd), block_size) - 1; if (block_size == dev->dev_attrib.block_size) return blocks_long; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 75ef52f008ff..7fa57fb57bf2 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -980,11 +980,10 @@ pscsi_execute_cmd(struct se_cmd *cmd) memcpy(pt->pscsi_cdb, cmd->t_task_cdb, scsi_command_size(cmd->t_task_cdb)); - req = blk_get_request(pdv->pdv_sd->request_queue, + req = scsi_alloc_request(pdv->pdv_sd->request_queue, cmd->data_direction == DMA_TO_DEVICE ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { - pr_err("PSCSI: blk_get_request() failed\n"); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto fail; } @@ -1012,7 +1011,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) return 0; fail_put_request: - blk_put_request(req); + blk_mq_free_request(req); fail: kfree(pt); return ret; @@ -1067,7 +1066,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status) break; } - blk_put_request(req); + blk_mq_free_request(req); kfree(pt); } diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index 5ce13b099d7d..5363ebebfc35 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -585,6 +585,9 @@ static int optee_remove(struct platform_device *pdev) { struct optee *optee = platform_get_drvdata(pdev); + /* Unregister OP-TEE specific client devices on TEE bus */ + optee_unregister_devices(); + /* * Ask OP-TEE to free all cached shared memory objects to decrease * reference counters and also avoid wild pointers in secure world diff --git a/drivers/tee/optee/device.c b/drivers/tee/optee/device.c index ec1d24693eba..128a2d2a50a1 100644 --- a/drivers/tee/optee/device.c +++ b/drivers/tee/optee/device.c @@ -53,6 +53,13 @@ static int get_devices(struct tee_context *ctx, u32 session, return 0; } +static void optee_release_device(struct device *dev) +{ + struct tee_client_device *optee_device = to_tee_client_device(dev); + + kfree(optee_device); +} + static int optee_register_device(const uuid_t *device_uuid) { struct tee_client_device *optee_device = NULL; @@ -63,6 +70,7 @@ static int optee_register_device(const uuid_t *device_uuid) return -ENOMEM; optee_device->dev.bus = &tee_bus_type; + optee_device->dev.release = optee_release_device; if (dev_set_name(&optee_device->dev, "optee-ta-%pUb", device_uuid)) { kfree(optee_device); return -ENOMEM; @@ -154,3 +162,17 @@ int optee_enumerate_devices(u32 func) { return __optee_enumerate_devices(func); } + +static int __optee_unregister_device(struct device *dev, void *data) +{ + if (!strncmp(dev_name(dev), "optee-ta", strlen("optee-ta"))) + device_unregister(dev); + + return 0; +} + +void optee_unregister_devices(void) +{ + bus_for_each_dev(&tee_bus_type, NULL, NULL, + __optee_unregister_device); +} diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h index dbdd367be156..f6bb4a763ba9 100644 --- a/drivers/tee/optee/optee_private.h +++ b/drivers/tee/optee/optee_private.h @@ -184,6 +184,7 @@ void optee_fill_pages_list(u64 *dst, struct page **pages, int num_pages, #define PTA_CMD_GET_DEVICES 0x0 #define PTA_CMD_GET_DEVICES_SUPP 
0x1 int optee_enumerate_devices(u32 func); +void optee_unregister_devices(void); /* * Small helpers diff --git a/drivers/tee/optee/shm_pool.c b/drivers/tee/optee/shm_pool.c index c41a9a501a6e..d167039af519 100644 --- a/drivers/tee/optee/shm_pool.c +++ b/drivers/tee/optee/shm_pool.c @@ -35,7 +35,7 @@ static int pool_op_alloc(struct tee_shm_pool_mgr *poolm, unsigned int nr_pages = 1 << order, i; struct page **pages; - pages = kcalloc(nr_pages, sizeof(pages), GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) { rc = -ENOMEM; goto err; diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index da19d7987d00..78fd365893c1 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -7,6 +7,7 @@ thunderbolt-objs += usb4_port.o nvm.o retimer.o quirks.o thunderbolt-${CONFIG_ACPI} += acpi.o thunderbolt-$(CONFIG_DEBUG_FS) += debugfs.o thunderbolt-${CONFIG_USB4_KUNIT_TEST} += test.o +CFLAGS_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) thunderbolt_dma_test-${CONFIG_USB4_DMA_TEST} += dma_test.o obj-$(CONFIG_USB4_DMA_TEST) += thunderbolt_dma_test.o diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 8f143c09a169..f0bf01ea069a 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -618,10 +618,8 @@ static int __init xenboot_console_setup(struct console *console, char *string) { static struct xencons_info xenboot; - if (xen_initial_domain()) + if (xen_initial_domain() || !xen_pv_domain()) return 0; - if (!xen_pv_domain()) - return -ENODEV; return xencons_info_pv_init(&xenboot, 0); } @@ -632,17 +630,16 @@ static void xenboot_write_console(struct console *console, const char *string, unsigned int linelen, off = 0; const char *pos; + if (dom0_write_console(0, string, len) >= 0) + return; + if (!xen_pv_domain()) { xen_hvm_early_write(0, string, len); return; } - dom0_write_console(0, string, len); - - if (xen_initial_domain()) + if (domU_write_console(0, "(early) ", 8) < 0) return; - - domU_write_console(0, "(early) ", 8); while (off < len && NULL != (pos = strchr(string+off, '\n'))) { linelen = pos-string+off; if (off + linelen > len) diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig index 71ae16de0f90..39fc96dc2531 100644 --- a/drivers/tty/serial/8250/Kconfig +++ b/drivers/tty/serial/8250/Kconfig @@ -361,9 +361,13 @@ config SERIAL_8250_BCM2835AUX If unsure, say N. config SERIAL_8250_FSL - bool + bool "Freescale 16550 UART support" if COMPILE_TEST && !(PPC || ARM || ARM64) depends on SERIAL_8250_CONSOLE - default PPC || ARM || ARM64 || COMPILE_TEST + default PPC || ARM || ARM64 + help + Selecting this option enables a workaround for a break-detection + erratum for Freescale 16550 UARTs in the 8250 driver. It also + enables support for ACPI enumeration. 
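The optee shm_pool hunk above changes the kcalloc() size argument from sizeof(pages) to sizeof(*pages). In that particular call both expressions happen to be pointer-sized (pages is a struct page **), so nothing was under-allocated, but sizing from the dereferenced pointer is the idiom that stays correct if the element type ever changes. A minimal sketch of the idiom, not the driver's code; the demo_ name is made up for illustration:

#include <linux/mm_types.h>
#include <linux/slab.h>

static struct page **demo_alloc_page_array(unsigned int nr_pages)
{
	struct page **pages;

	/*
	 * sizeof(*pages) is the size of one array element (a struct page *),
	 * regardless of how the "pages" variable itself is declared.
	 */
	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);

	return pages;	/* caller checks for NULL and kfree()s the array */
}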
config SERIAL_8250_DW tristate "Support for Synopsys DesignWare 8250 quirks" diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index 8b7bc10b6e8b..f1d100671ee6 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -420,11 +420,16 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) data->phy = devm_usb_get_phy_by_phandle(dev, "fsl,usbphy", 0); if (IS_ERR(data->phy)) { ret = PTR_ERR(data->phy); - /* Return -EINVAL if no usbphy is available */ - if (ret == -ENODEV) - data->phy = NULL; - else - goto err_clk; + if (ret == -ENODEV) { + data->phy = devm_usb_get_phy_by_phandle(dev, "phys", 0); + if (IS_ERR(data->phy)) { + ret = PTR_ERR(data->phy); + if (ret == -ENODEV) + data->phy = NULL; + else + goto err_clk; + } + } } pdata.usb_phy = data->phy; diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 4e2f1552f4b7..7b2e2420ecae 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -340,6 +340,9 @@ static void acm_process_notification(struct acm *acm, unsigned char *buf) acm->iocount.overrun++; spin_unlock_irqrestore(&acm->read_lock, flags); + if (newctrl & ACM_CTRL_BRK) + tty_flip_buffer_push(&acm->port); + if (difference) wake_up_all(&acm->wioctl); @@ -475,11 +478,16 @@ static int acm_submit_read_urbs(struct acm *acm, gfp_t mem_flags) static void acm_process_read_urb(struct acm *acm, struct urb *urb) { + unsigned long flags; + if (!urb->actual_length) return; + spin_lock_irqsave(&acm->read_lock, flags); tty_insert_flip_string(&acm->port, urb->transfer_buffer, urb->actual_length); + spin_unlock_irqrestore(&acm->read_lock, flags); + tty_flip_buffer_push(&acm->port); } diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 35d5908b5478..fdf79bcf7eb0 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -824,7 +824,7 @@ static struct usb_class_driver wdm_class = { }; /* --- WWAN framework integration --- */ -#ifdef CONFIG_WWAN_CORE +#ifdef CONFIG_WWAN static int wdm_wwan_port_start(struct wwan_port *port) { struct wdm_device *desc = wwan_port_get_drvdata(port); @@ -963,11 +963,11 @@ static void wdm_wwan_rx(struct wdm_device *desc, int length) /* inbuf has been copied, it is safe to check for outstanding data */ schedule_work(&desc->service_outs_intr); } -#else /* CONFIG_WWAN_CORE */ +#else /* CONFIG_WWAN */ static void wdm_wwan_init(struct wdm_device *desc) {} static void wdm_wwan_deinit(struct wdm_device *desc) {} static void wdm_wwan_rx(struct wdm_device *desc, int length) {} -#endif /* CONFIG_WWAN_CORE */ +#endif /* CONFIG_WWAN */ /* --- error handling --- */ static void wdm_rxwork(struct work_struct *work) diff --git a/drivers/usb/common/Kconfig b/drivers/usb/common/Kconfig index 5e8a04e3dd3c..b856622431a7 100644 --- a/drivers/usb/common/Kconfig +++ b/drivers/usb/common/Kconfig @@ -6,8 +6,7 @@ config USB_COMMON config USB_LED_TRIG bool "USB LED Triggers" - depends on LEDS_CLASS && LEDS_TRIGGERS - select USB_COMMON + depends on LEDS_CLASS && USB_COMMON && LEDS_TRIGGERS help This option adds LED triggers for USB host and/or gadget activity. 
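The cdc-acm hunk above takes acm->read_lock around tty_insert_flip_string() so this path cannot race with the other completion paths that fill the same flip buffer, and only pushes the buffer after the lock is dropped. A hedged sketch of that pattern; the demo_ structures are illustrative stand-ins, not the driver's own:

#include <linux/spinlock.h>
#include <linux/tty.h>
#include <linux/tty_flip.h>

struct demo_port {
	spinlock_t lock;	/* initialized with spin_lock_init() at probe */
	struct tty_port port;	/* initialized with tty_port_init() at probe */
};

static void demo_rx(struct demo_port *dp, const unsigned char *buf, size_t len)
{
	unsigned long flags;

	if (!len)
		return;

	/* Only the insertion into the flip buffer needs the producer lock. */
	spin_lock_irqsave(&dp->lock, flags);
	tty_insert_flip_string(&dp->port, buf, len);
	spin_unlock_irqrestore(&dp->lock, flags);

	/* Push to the line discipline outside the spinlock. */
	tty_flip_buffer_push(&dp->port);
}

Keeping tty_flip_buffer_push() outside the critical section matches the patch: it only queues the flush work and does not need the same serialization as the insert.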
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 804b50548163..4519d06c9ca2 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4243,7 +4243,7 @@ int dwc3_gadget_init(struct dwc3 *dwc) } - usb_initialize_gadget(dwc->sysdev, dwc->gadget, dwc_gadget_release); + usb_initialize_gadget(dwc->dev, dwc->gadget, dwc_gadget_release); dev = &dwc->gadget->dev; dev->platform_data = dwc; dwc->gadget->ops = &dwc3_gadget_ops; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 8260f38025b7..e20c19a0f106 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -831,7 +831,7 @@ static void ffs_user_copy_worker(struct work_struct *work) kthread_unuse_mm(io_data->mm); } - io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); + io_data->kiocb->ki_complete(io_data->kiocb, ret); if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd) eventfd_signal(io_data->ffs->ffs_eventfd, 1); diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index be864560bfea..ef55b8bb5870 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -674,11 +674,17 @@ static int set_ep_max_packet_size(const struct f_uac2_opts *uac2_opts, ssize = uac2_opts->c_ssize; } - if (!is_playback && (uac2_opts->c_sync == USB_ENDPOINT_SYNC_ASYNC)) + if (!is_playback && (uac2_opts->c_sync == USB_ENDPOINT_SYNC_ASYNC)) { + // Win10 requires max packet size + 1 frame srate = srate * (1000 + uac2_opts->fb_max) / 1000; - - max_size_bw = num_channels(chmask) * ssize * - DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1))); + // updated srate is always bigger, therefore DIV_ROUND_UP always yields +1 + max_size_bw = num_channels(chmask) * ssize * + (DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1)))); + } else { + // adding 1 frame provision for Win10 + max_size_bw = num_channels(chmask) * ssize * + (DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1))) + 1); + } ep_desc->wMaxPacketSize = cpu_to_le16(min_t(u16, max_size_bw, max_size_ep)); diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 539220d7f5b6..78be94750232 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -469,7 +469,7 @@ static void ep_user_copy_worker(struct work_struct *work) ret = -EFAULT; /* completing the iocb can drop the ctx and mm, don't touch mm after */ - iocb->ki_complete(iocb, ret, ret); + iocb->ki_complete(iocb, ret); kfree(priv->buf); kfree(priv->to_free); @@ -496,11 +496,8 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) kfree(priv->to_free); kfree(priv); iocb->private = NULL; - /* aio_complete() reports bytes-transferred _and_ faults */ - iocb->ki_complete(iocb, - req->actual ? req->actual : (long)req->status, - req->status); + req->actual ? 
req->actual : (long)req->status); } else { /* ep_copy_to_user() won't report both; we hide some faults */ if (unlikely(0 != req->status)) diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 0b3722770760..ded9738392e4 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -40,17 +40,6 @@ #include <mach/usb.h> -/* OMAP-1510 OHCI has its own MMU for DMA */ -#define OMAP1510_LB_MEMSIZE 32 /* Should be same as SDRAM size */ -#define OMAP1510_LB_CLOCK_DIV 0xfffec10c -#define OMAP1510_LB_MMU_CTL 0xfffec208 -#define OMAP1510_LB_MMU_LCK 0xfffec224 -#define OMAP1510_LB_MMU_LD_TLB 0xfffec228 -#define OMAP1510_LB_MMU_CAM_H 0xfffec22c -#define OMAP1510_LB_MMU_CAM_L 0xfffec230 -#define OMAP1510_LB_MMU_RAM_H 0xfffec234 -#define OMAP1510_LB_MMU_RAM_L 0xfffec238 - #define DRIVER_DESC "OHCI OMAP driver" struct ohci_omap_priv { @@ -104,61 +93,6 @@ static int omap_ohci_transceiver_power(struct ohci_omap_priv *priv, int on) return 0; } -#ifdef CONFIG_ARCH_OMAP15XX -/* - * OMAP-1510 specific Local Bus clock on/off - */ -static int omap_1510_local_bus_power(int on) -{ - if (on) { - omap_writel((1 << 1) | (1 << 0), OMAP1510_LB_MMU_CTL); - udelay(200); - } else { - omap_writel(0, OMAP1510_LB_MMU_CTL); - } - - return 0; -} - -/* - * OMAP-1510 specific Local Bus initialization - * NOTE: This assumes 32MB memory size in OMAP1510LB_MEMSIZE. - * See also arch/mach-omap/memory.h for __virt_to_dma() and - * __dma_to_virt() which need to match with the physical - * Local Bus address below. - */ -static int omap_1510_local_bus_init(void) -{ - unsigned int tlb; - unsigned long lbaddr, physaddr; - - omap_writel((omap_readl(OMAP1510_LB_CLOCK_DIV) & 0xfffffff8) | 0x4, - OMAP1510_LB_CLOCK_DIV); - - /* Configure the Local Bus MMU table */ - for (tlb = 0; tlb < OMAP1510_LB_MEMSIZE; tlb++) { - lbaddr = tlb * 0x00100000 + OMAP1510_LB_OFFSET; - physaddr = tlb * 0x00100000 + PHYS_OFFSET; - omap_writel((lbaddr & 0x0fffffff) >> 22, OMAP1510_LB_MMU_CAM_H); - omap_writel(((lbaddr & 0x003ffc00) >> 6) | 0xc, - OMAP1510_LB_MMU_CAM_L); - omap_writel(physaddr >> 16, OMAP1510_LB_MMU_RAM_H); - omap_writel((physaddr & 0x0000fc00) | 0x300, OMAP1510_LB_MMU_RAM_L); - omap_writel(tlb << 4, OMAP1510_LB_MMU_LCK); - omap_writel(0x1, OMAP1510_LB_MMU_LD_TLB); - } - - /* Enable the walking table */ - omap_writel(omap_readl(OMAP1510_LB_MMU_CTL) | (1 << 3), OMAP1510_LB_MMU_CTL); - udelay(200); - - return 0; -} -#else -#define omap_1510_local_bus_power(x) {} -#define omap_1510_local_bus_init() {} -#endif - #ifdef CONFIG_USB_OTG static void start_hnp(struct ohci_hcd *ohci) @@ -229,10 +163,8 @@ static int ohci_omap_reset(struct usb_hcd *hcd) omap_ohci_clock_power(priv, 1); - if (cpu_is_omap15xx()) { - omap_1510_local_bus_power(1); - omap_1510_local_bus_init(); - } + if (config->lb_reset) + config->lb_reset(); ret = ohci_setup(hcd); if (ret < 0) diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c index 6e784f2fc26d..eb46e642e87a 100644 --- a/drivers/usb/host/xhci-dbgtty.c +++ b/drivers/usb/host/xhci-dbgtty.c @@ -408,40 +408,38 @@ static int xhci_dbc_tty_register_device(struct xhci_dbc *dbc) return -EBUSY; xhci_dbc_tty_init_port(dbc, port); - tty_dev = tty_port_register_device(&port->port, - dbc_tty_driver, 0, NULL); - if (IS_ERR(tty_dev)) { - ret = PTR_ERR(tty_dev); - goto register_fail; - } ret = kfifo_alloc(&port->write_fifo, DBC_WRITE_BUF_SIZE, GFP_KERNEL); if (ret) - goto buf_alloc_fail; + goto err_exit_port; ret = xhci_dbc_alloc_requests(dbc, BULK_IN, &port->read_pool, 
dbc_read_complete); if (ret) - goto request_fail; + goto err_free_fifo; ret = xhci_dbc_alloc_requests(dbc, BULK_OUT, &port->write_pool, dbc_write_complete); if (ret) - goto request_fail; + goto err_free_requests; + + tty_dev = tty_port_register_device(&port->port, + dbc_tty_driver, 0, NULL); + if (IS_ERR(tty_dev)) { + ret = PTR_ERR(tty_dev); + goto err_free_requests; + } port->registered = true; return 0; -request_fail: +err_free_requests: xhci_dbc_free_requests(&port->read_pool); xhci_dbc_free_requests(&port->write_pool); +err_free_fifo: kfifo_free(&port->write_fifo); - -buf_alloc_fail: - tty_unregister_device(dbc_tty_driver, 0); - -register_fail: +err_exit_port: xhci_dbc_tty_exit_port(port); dev_err(dbc->dev, "can't register tty port, err %d\n", ret); diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 2c9f25ca8edd..2484a9d38ce2 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -30,6 +30,7 @@ #define PCI_VENDOR_ID_FRESCO_LOGIC 0x1b73 #define PCI_DEVICE_ID_FRESCO_LOGIC_PDK 0x1000 #define PCI_DEVICE_ID_FRESCO_LOGIC_FL1009 0x1009 +#define PCI_DEVICE_ID_FRESCO_LOGIC_FL1100 0x1100 #define PCI_DEVICE_ID_FRESCO_LOGIC_FL1400 0x1400 #define PCI_VENDOR_ID_ETRON 0x1b6f @@ -113,6 +114,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) /* Look for vendor-specific quirks */ if (pdev->vendor == PCI_VENDOR_ID_FRESCO_LOGIC && (pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_PDK || + pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_FL1100 || pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_FL1400)) { if (pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_PDK && pdev->revision == 0x0) { @@ -279,8 +281,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == 0x3432) xhci->quirks |= XHCI_BROKEN_STREAMS; - if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) + if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) { xhci->quirks |= XHCI_LPM_SUPPORT; + xhci->quirks |= XHCI_EP_CTX_BROKEN_DCS; + } if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA && pdev->device == PCI_DEVICE_ID_ASMEDIA_1042_XHCI) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index e676749f543b..311597bba80e 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -366,16 +366,22 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, /* Must be called with xhci->lock held, releases and aquires lock back */ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags) { - u64 temp_64; + u32 temp_32; int ret; xhci_dbg(xhci, "Abort command ring\n"); reinit_completion(&xhci->cmd_ring_stop_completion); - temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); - xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, - &xhci->op_regs->cmd_ring); + /* + * The control bits like command stop, abort are located in lower + * dword of the command ring control register. Limit the write + * to the lower dword to avoid corrupting the command ring pointer + * in case if the command ring is stopped by the time upper dword + * is written. + */ + temp_32 = readl(&xhci->op_regs->cmd_ring); + writel(temp_32 | CMD_RING_ABORT, &xhci->op_regs->cmd_ring); /* Section 4.6.1.2 of xHCI 1.0 spec says software should also time the * completion of the Command Abort operation. 
If CRR is not negated in 5 @@ -559,8 +565,11 @@ static int xhci_move_dequeue_past_td(struct xhci_hcd *xhci, struct xhci_ring *ep_ring; struct xhci_command *cmd; struct xhci_segment *new_seg; + struct xhci_segment *halted_seg = NULL; union xhci_trb *new_deq; int new_cycle; + union xhci_trb *halted_trb; + int index = 0; dma_addr_t addr; u64 hw_dequeue; bool cycle_found = false; @@ -598,7 +607,27 @@ static int xhci_move_dequeue_past_td(struct xhci_hcd *xhci, hw_dequeue = xhci_get_hw_deq(xhci, dev, ep_index, stream_id); new_seg = ep_ring->deq_seg; new_deq = ep_ring->dequeue; - new_cycle = hw_dequeue & 0x1; + + /* + * Quirk: xHC write-back of the DCS field in the hardware dequeue + * pointer is wrong - use the cycle state of the TRB pointed to by + * the dequeue pointer. + */ + if (xhci->quirks & XHCI_EP_CTX_BROKEN_DCS && + !(ep->ep_state & EP_HAS_STREAMS)) + halted_seg = trb_in_td(xhci, td->start_seg, + td->first_trb, td->last_trb, + hw_dequeue & ~0xf, false); + if (halted_seg) { + index = ((dma_addr_t)(hw_dequeue & ~0xf) - halted_seg->dma) / + sizeof(*halted_trb); + halted_trb = &halted_seg->trbs[index]; + new_cycle = halted_trb->generic.field[3] & 0x1; + xhci_dbg(xhci, "Endpoint DCS = %d TRB index = %d cycle = %d\n", + (u8)(hw_dequeue & 0x1), index, new_cycle); + } else { + new_cycle = hw_dequeue & 0x1; + } /* * We want to find the pointer, segment and cycle state of the new trb diff --git a/drivers/usb/host/xhci-tegra.c b/drivers/usb/host/xhci-tegra.c index 575fa89a783f..1bf494b649bd 100644 --- a/drivers/usb/host/xhci-tegra.c +++ b/drivers/usb/host/xhci-tegra.c @@ -1787,7 +1787,6 @@ static int tegra_xusb_remove(struct platform_device *pdev) return 0; } -#if IS_ENABLED(CONFIG_PM) || IS_ENABLED(CONFIG_PM_SLEEP) static bool xhci_hub_ports_suspended(struct xhci_hub *hub) { struct device *dev = hub->hcd->self.controller; @@ -2102,7 +2101,7 @@ out: return err; } -static int tegra_xusb_suspend(struct device *dev) +static __maybe_unused int tegra_xusb_suspend(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int err; @@ -2144,7 +2143,7 @@ out: return err; } -static int tegra_xusb_resume(struct device *dev) +static __maybe_unused int tegra_xusb_resume(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int err; @@ -2174,10 +2173,8 @@ static int tegra_xusb_resume(struct device *dev) return 0; } -#endif -#ifdef CONFIG_PM -static int tegra_xusb_runtime_suspend(struct device *dev) +static __maybe_unused int tegra_xusb_runtime_suspend(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int ret; @@ -2190,7 +2187,7 @@ static int tegra_xusb_runtime_suspend(struct device *dev) return ret; } -static int tegra_xusb_runtime_resume(struct device *dev) +static __maybe_unused int tegra_xusb_runtime_resume(struct device *dev) { struct tegra_xusb *tegra = dev_get_drvdata(dev); int err; @@ -2201,7 +2198,6 @@ static int tegra_xusb_runtime_resume(struct device *dev) return err; } -#endif static const struct dev_pm_ops tegra_xusb_pm_ops = { SET_RUNTIME_PM_OPS(tegra_xusb_runtime_suspend, diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 93c38b557afd..541fe4dcc43a 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -3214,10 +3214,13 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd, return; /* Bail out if toggle is already being cleared by a endpoint reset */ + spin_lock_irqsave(&xhci->lock, flags); if (ep->ep_state & EP_HARD_CLEAR_TOGGLE) { ep->ep_state &= ~EP_HARD_CLEAR_TOGGLE; + spin_unlock_irqrestore(&xhci->lock, 
flags); return; } + spin_unlock_irqrestore(&xhci->lock, flags); /* Only interrupt and bulk ep's use data toggle, USB2 spec 5.5.4-> */ if (usb_endpoint_xfer_control(&host_ep->desc) || usb_endpoint_xfer_isoc(&host_ep->desc)) @@ -3303,8 +3306,10 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd, xhci_free_command(xhci, cfg_cmd); cleanup: xhci_free_command(xhci, stop_cmd); + spin_lock_irqsave(&xhci->lock, flags); if (ep->ep_state & EP_SOFT_CLEAR_TOGGLE) ep->ep_state &= ~EP_SOFT_CLEAR_TOGGLE; + spin_unlock_irqrestore(&xhci->lock, flags); } static int xhci_check_streams_endpoint(struct xhci_hcd *xhci, diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index dca6181c33fd..5a75fe563123 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1899,6 +1899,7 @@ struct xhci_hcd { #define XHCI_SG_TRB_CACHE_SIZE_QUIRK BIT_ULL(39) #define XHCI_NO_SOFT_RETRY BIT_ULL(40) #define XHCI_BROKEN_D3COLD BIT_ULL(41) +#define XHCI_EP_CTX_BROKEN_DCS BIT_ULL(42) unsigned int num_active_eps; unsigned int limit_active_eps; diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index ce9fc46c9266..b5935834f9d2 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -899,11 +899,13 @@ static int dsps_probe(struct platform_device *pdev) if (usb_get_dr_mode(&pdev->dev) == USB_DR_MODE_PERIPHERAL) { ret = dsps_setup_optional_vbus_irq(pdev, glue); if (ret) - goto err; + goto unregister_pdev; } return 0; +unregister_pdev: + platform_device_unregister(glue->musb); err: pm_runtime_disable(&pdev->dev); iounmap(glue->usbss_base); diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 6cfb5d33609f..a484ff5e4ebf 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -246,11 +246,13 @@ static void option_instat_callback(struct urb *urb); /* These Quectel products use Quectel's vendor ID */ #define QUECTEL_PRODUCT_EC21 0x0121 #define QUECTEL_PRODUCT_EC25 0x0125 +#define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 #define QUECTEL_PRODUCT_BG96 0x0296 #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 +#define QUECTEL_PRODUCT_EC200S_CN 0x6002 #define QUECTEL_PRODUCT_EC200T 0x6026 #define CMOTECH_VENDOR_ID 0x16d8 @@ -1111,6 +1113,9 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0xff, 0xff), + .driver_info = NUMEP2 }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0, 0) }, @@ -1128,6 +1133,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), .driver_info = ZLP }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200S_CN, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, @@ -1227,6 +1233,8 @@ static const 
struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1203, 0xff), /* Telit LE910Cx (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1204, 0xff), /* Telit LE910Cx (MBIM) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 83da8236e3c8..c18bf8164bc2 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -165,6 +165,7 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */ {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */ {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */ + {DEVICE_SWI(0x1199, 0x90d2)}, /* Sierra Wireless EM9191 QDL */ {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index 9858716698df..c15eec9cc460 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -696,7 +696,7 @@ irqreturn_t tcpci_irq(struct tcpci *tcpci) tcpm_pd_receive(tcpci->port, &msg); } - if (status & TCPC_ALERT_EXTENDED_STATUS) { + if (tcpci->data->vbus_vsafe0v && (status & TCPC_ALERT_EXTENDED_STATUS)) { ret = regmap_read(tcpci->regmap, TCPC_EXTENDED_STATUS, &raw); if (!ret && (raw & TCPC_EXTENDED_STATUS_VSAFE0V)) tcpm_vbus_change(tcpci->port); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index a4d37205df54..7f2f3ff1b391 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4876,6 +4876,7 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, tcpm_set_state(port, SRC_ATTACH_WAIT, 0); break; case SRC_ATTACHED: + case SRC_STARTUP: case SRC_SEND_CAPABILITIES: case SRC_READY: if (tcpm_port_is_disconnected(port) || diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index 21b3ae25c76d..ea4cc0a6e40c 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -625,10 +625,6 @@ static int tps6598x_probe(struct i2c_client *client) if (ret < 0) return ret; - fwnode = device_get_named_child_node(&client->dev, "connector"); - if (!fwnode) - return -ENODEV; - /* * This fwnode has a "compatible" property, but is never populated as a * struct device. Instead we simply parse it to read the properties. @@ -636,7 +632,9 @@ static int tps6598x_probe(struct i2c_client *client) * with existing DT files, we work around this by deleting any * fwnode_links to/from this fwnode. 
*/ - fw_devlink_purge_absent_suppliers(fwnode); + fwnode = device_get_named_child_node(&client->dev, "connector"); + if (fwnode) + fw_devlink_purge_absent_suppliers(fwnode); tps->role_sw = fwnode_usb_role_switch_get(fwnode); if (IS_ERR(tps->role_sw)) { diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 26e3d90d1e7c..841667a896dd 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -80,6 +80,7 @@ struct vduse_dev { struct vdpa_callback config_cb; struct work_struct inject; spinlock_t irq_lock; + struct rw_semaphore rwsem; int minor; bool broken; bool connected; @@ -410,6 +411,8 @@ static void vduse_dev_reset(struct vduse_dev *dev) if (domain->bounce_map) vduse_domain_reset_bounce_map(domain); + down_write(&dev->rwsem); + dev->status = 0; dev->driver_features = 0; dev->generation++; @@ -443,6 +446,8 @@ static void vduse_dev_reset(struct vduse_dev *dev) flush_work(&vq->inject); flush_work(&vq->kick); } + + up_write(&dev->rwsem); } static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx, @@ -885,6 +890,23 @@ static void vduse_vq_irq_inject(struct work_struct *work) spin_unlock_irq(&vq->irq_lock); } +static int vduse_dev_queue_irq_work(struct vduse_dev *dev, + struct work_struct *irq_work) +{ + int ret = -EINVAL; + + down_read(&dev->rwsem); + if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto unlock; + + ret = 0; + queue_work(vduse_irq_wq, irq_work); +unlock: + up_read(&dev->rwsem); + + return ret; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -966,8 +988,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; } case VDUSE_DEV_INJECT_CONFIG_IRQ: - ret = 0; - queue_work(vduse_irq_wq, &dev->inject); + ret = vduse_dev_queue_irq_work(dev, &dev->inject); break; case VDUSE_VQ_SETUP: { struct vduse_vq_config config; @@ -1053,9 +1074,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (index >= dev->vq_num) break; - ret = 0; index = array_index_nospec(index, dev->vq_num); - queue_work(vduse_irq_wq, &dev->vqs[index].inject); + ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); break; } default: @@ -1136,6 +1156,7 @@ static struct vduse_dev *vduse_dev_create(void) INIT_LIST_HEAD(&dev->send_list); INIT_LIST_HEAD(&dev->recv_list); spin_lock_init(&dev->irq_lock); + init_rwsem(&dev->rwsem); INIT_WORK(&dev->inject, vduse_dev_irq_inject); init_waitqueue_head(&dev->waitq); diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 35927ceb26ff..39039e046117 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -173,6 +173,10 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) return -EINVAL; + if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK)) + for (i = 0; i < nvqs; i++) + vhost_vdpa_unsetup_vq_irq(v, i); + if (status == 0) { ret = ops->reset(vdpa); if (ret) @@ -184,10 +188,6 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) for (i = 0; i < nvqs; i++) vhost_vdpa_setup_vq_irq(v, i); - if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK)) - for (i = 0; i < nvqs; i++) - vhost_vdpa_unsetup_vq_irq(v, i); - return 0; } @@ -322,7 +322,7 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) struct eventfd_ctx *ctx; cb.callback = vhost_vdpa_config_cb; - cb.private = v->vdpa; + cb.private = v; if 
(copy_from_user(&fd, argp, sizeof(fd))) return -EFAULT; diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index b26b79dfcac9..6ed5e608dd04 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -2193,8 +2193,9 @@ config FB_HYPERV This framebuffer driver supports Microsoft Hyper-V Synthetic Video. config FB_SIMPLE - bool "Simple framebuffer support" - depends on (FB = y) && !DRM_SIMPLEDRM + tristate "Simple framebuffer support" + depends on FB + depends on !DRM_SIMPLEDRM select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c index c5b99a4861e8..6b4d5a7f3e15 100644 --- a/drivers/video/fbdev/gbefb.c +++ b/drivers/video/fbdev/gbefb.c @@ -1267,7 +1267,7 @@ static struct platform_device *gbefb_device; static int __init gbefb_init(void) { int ret = platform_driver_register(&gbefb_driver); - if (!ret) { + if (IS_ENABLED(CONFIG_SGI_IP32) && !ret) { gbefb_device = platform_device_alloc("gbefb", 0); if (gbefb_device) { ret = platform_device_add(gbefb_device); diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 0a5b54034d4b..236081afe9a2 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -239,6 +239,17 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } + /* + * Some devices detect legacy solely via F_VERSION_1. Write + * F_VERSION_1 to force LE config space accesses before FEATURES_OK for + * these when needed. + */ + if (drv->validate && !virtio_legacy_is_little_endian() + && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { + dev->features = BIT_ULL(VIRTIO_F_VERSION_1); + dev->config->finalize_features(dev); + } + if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index dd95dfd85e98..3035bb6f5458 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -576,7 +576,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); if (!indirect && vq->use_dma_api) - vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags = + vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= ~VRING_DESC_F_NEXT; if (indirect) { diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 643c6c2d0b72..ced2fc0deb8c 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -71,8 +71,6 @@ #define TCOBASE(p) ((p)->tco_res->start) /* SMI Control and Enable Register */ #define SMI_EN(p) ((p)->smi_res->start) -#define TCO_EN (1 << 13) -#define GBL_SMI_EN (1 << 0) #define TCO_RLD(p) (TCOBASE(p) + 0x00) /* TCO Timer Reload/Curr. Value */ #define TCOv1_TMR(p) (TCOBASE(p) + 0x01) /* TCOv1 Timer Initial Value*/ @@ -357,12 +355,8 @@ static int iTCO_wdt_set_timeout(struct watchdog_device *wd_dev, unsigned int t) tmrval = seconds_to_ticks(p, t); - /* - * If TCO SMIs are off, the timer counts down twice before rebooting. - * Otherwise, the BIOS generally reboots when the SMI triggers. 
- */ - if (p->smi_res && - (inl(SMI_EN(p)) & (TCO_EN | GBL_SMI_EN)) != (TCO_EN | GBL_SMI_EN)) + /* For TCO v1 the timer counts down twice before rebooting */ + if (p->iTCO_version == 1) tmrval /= 2; /* from the specs: */ @@ -527,7 +521,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) * Disables TCO logic generating an SMI# */ val32 = inl(SMI_EN(p)); - val32 &= ~TCO_EN; /* Turn off SMI clearing watchdog */ + val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ outl(val32, SMI_EN(p)); } diff --git a/drivers/watchdog/ixp4xx_wdt.c b/drivers/watchdog/ixp4xx_wdt.c index 2693ffb24ac7..31b03fa71341 100644 --- a/drivers/watchdog/ixp4xx_wdt.c +++ b/drivers/watchdog/ixp4xx_wdt.c @@ -119,7 +119,7 @@ static int ixp4xx_wdt_probe(struct platform_device *pdev) iwdt = devm_kzalloc(dev, sizeof(*iwdt), GFP_KERNEL); if (!iwdt) return -ENOMEM; - iwdt->base = dev->platform_data; + iwdt->base = (void __iomem *)dev->platform_data; /* * Retrieve rate from a fixed clock from the device tree if diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c index 1616f93dfad7..74d785b2b478 100644 --- a/drivers/watchdog/omap_wdt.c +++ b/drivers/watchdog/omap_wdt.c @@ -268,8 +268,12 @@ static int omap_wdt_probe(struct platform_device *pdev) wdev->wdog.bootstatus = WDIOF_CARDRESET; } - if (!early_enable) + if (early_enable) { + omap_wdt_start(&wdev->wdog); + set_bit(WDOG_HW_RUNNING, &wdev->wdog.status); + } else { omap_wdt_disable(wdev); + } ret = watchdog_register_device(&wdev->wdog); if (ret) { diff --git a/drivers/watchdog/sbsa_gwdt.c b/drivers/watchdog/sbsa_gwdt.c index ee9ff38929eb..9791c74aebd4 100644 --- a/drivers/watchdog/sbsa_gwdt.c +++ b/drivers/watchdog/sbsa_gwdt.c @@ -130,7 +130,7 @@ static u64 sbsa_gwdt_reg_read(struct sbsa_gwdt *gwdt) if (gwdt->version == 0) return readl(gwdt->control_base + SBSA_GWDT_WOR); else - return readq(gwdt->control_base + SBSA_GWDT_WOR); + return lo_hi_readq(gwdt->control_base + SBSA_GWDT_WOR); } static void sbsa_gwdt_reg_write(u64 val, struct sbsa_gwdt *gwdt) @@ -138,7 +138,7 @@ static void sbsa_gwdt_reg_write(u64 val, struct sbsa_gwdt *gwdt) if (gwdt->version == 0) writel((u32)val, gwdt->control_base + SBSA_GWDT_WOR); else - writeq(val, gwdt->control_base + SBSA_GWDT_WOR); + lo_hi_writeq(val, gwdt->control_base + SBSA_GWDT_WOR); } /* @@ -411,4 +411,3 @@ MODULE_AUTHOR("Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>"); MODULE_AUTHOR("Al Stone <al.stone@linaro.org>"); MODULE_AUTHOR("Timur Tabi <timur@codeaurora.org>"); MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:" DRV_NAME); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 22f5aff0c136..1b2c3aca6887 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -241,7 +241,7 @@ config XEN_PRIVCMD config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" - depends on XEN && XEN_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ + depends on XEN && XEN_PV_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ default m help This ACPI processor uploads Power Management information to the Xen @@ -259,7 +259,7 @@ config XEN_ACPI_PROCESSOR config XEN_MCE_LOG bool "Xen platform mcelog" - depends on XEN_DOM0 && X86_MCE + depends on XEN_PV_DOM0 && X86_MCE help Allow kernel fetching MCE error from Xen platform and converting it into Linux mcelog format for mcelog tools diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43ebfe36ac27..3a50f097ed3e 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -491,12 +491,12 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t 
gfp) } /* - * Stop waiting if either state is not BP_EAGAIN and ballooning action is - * needed, or if the credit has changed while state is BP_EAGAIN. + * Stop waiting if either state is BP_DONE and ballooning action is + * needed, or if the credit has changed while state is not BP_DONE. */ static bool balloon_thread_cond(enum bp_state state, long credit) { - if (state != BP_EAGAIN) + if (state == BP_DONE) credit = 0; return current_credit() != credit || kthread_should_stop(); @@ -516,10 +516,19 @@ static int balloon_thread(void *unused) set_freezable(); for (;;) { - if (state == BP_EAGAIN) - timeout = balloon_stats.schedule_delay * HZ; - else + switch (state) { + case BP_DONE: + case BP_ECANCELED: timeout = 3600 * HZ; + break; + case BP_EAGAIN: + timeout = balloon_stats.schedule_delay * HZ; + break; + case BP_WAIT: + timeout = HZ; + break; + } + credit = current_credit(); wait_event_freezable_timeout(balloon_thread_wq, diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 720a7b7abd46..3369734108af 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -257,7 +257,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) LIST_HEAD(pagelist); struct mmap_gfn_state state; - /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ if (xen_feature(XENFEAT_auto_translated_physmap)) return -ENOSYS; @@ -420,7 +420,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) int rc; struct page **pages; - pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); if (pages == NULL) return -ENOMEM; @@ -428,7 +428,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); - kfree(pages); + kvfree(pages); return -ENOMEM; } BUG_ON(vma->vm_private_data != NULL); @@ -803,21 +803,21 @@ static long privcmd_ioctl_mmap_resource(struct file *file, unsigned int domid = (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? DOMID_SELF : kdata.dom; - int num; + int num, *errs = (int *)pfns; + BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); num = xen_remap_domain_mfn_array(vma, kdata.addr & PAGE_MASK, - pfns, kdata.num, (int *)pfns, + pfns, kdata.num, errs, vma->vm_page_prot, - domid, - vma->vm_private_data); + domid); if (num < 0) rc = num; else if (num != kdata.num) { unsigned int i; for (i = 0; i < num; i++) { - rc = pfns[i]; + rc = errs[i]; if (rc < 0) break; } @@ -912,7 +912,7 @@ static void privcmd_close(struct vm_area_struct *vma) else pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", numpgs, rc); - kfree(pages); + kvfree(pages); } static vm_fault_t privcmd_fault(struct vm_fault *vmf) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index eb2151fb6049..1769a44f4819 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -23,7 +23,7 @@ struct fscache_netfs v9fs_cache_netfs = { .version = 0, }; -/** +/* * v9fs_random_cachetag - Generate a random tag to be associated * with a new cache session. 
* @@ -233,7 +233,7 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data, unlock_page(page); } -/** +/* * __v9fs_readpage_from_fscache - read a page from cache * * Returns 0 if the pages are in cache and a BIO is submitted, @@ -268,7 +268,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) } } -/** +/* * __v9fs_readpages_from_fscache - read multiple pages from cache * * Returns 0 if the pages are in cache and a BIO is submitted, @@ -308,7 +308,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, } } -/** +/* * __v9fs_readpage_to_fscache - write a page to the cache * */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 9d9de62592be..b8863dd0de5c 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -19,18 +19,18 @@ #include "v9fs_vfs.h" #include "fid.h" +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) +{ + hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} + + /** * v9fs_fid_add - add a fid to a dentry * @dentry: dentry that the fid is being added to * @fid: fid to add * */ - -static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) -{ - hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); -} - void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { spin_lock(&dentry->d_lock); @@ -67,7 +67,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) /** * v9fs_open_fid_add - add an open fid to an inode - * @dentry: inode that the fid is being added to + * @inode: inode that the fid is being added to * @fid: fid to add * */ diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index cdb99507ef33..2e0fa7c932db 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -155,6 +155,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information + * @opts: The mount option string * * Return 0 upon success, -ERRNO upon failure. */ @@ -542,12 +543,9 @@ extern int v9fs_error_init(void); static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE -/** - * caches_show - list caches associated with a session - * - * Returns the size of buffer written. +/* + * List caches associated with a session */ - static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index cce9ace651a2..1c4f1b39cc95 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -30,8 +30,7 @@ /** * v9fs_fid_readpage - read an entire page in from 9P - * - * @fid: fid being read + * @data: Opaque pointer to the fid being read * @page: structure to page * */ @@ -116,6 +115,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, /** * v9fs_release_page - release the private state associated with a page + * @page: The page to be released + * @gfp: The caller's allocation restrictions * * Returns 1 if the page can be released, false otherwise. 
*/ @@ -129,9 +130,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) /** * v9fs_invalidate_page - Invalidate a page completely or partially - * - * @page: structure to page - * @offset: offset in the page + * @page: The page to be invalidated + * @offset: offset of the invalidated region + * @length: length of the invalidated region */ static void v9fs_invalidate_page(struct page *page, unsigned int offset, @@ -199,6 +200,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) /** * v9fs_launder_page - Writeback a dirty page + * @page: The page to be cleaned up + * * Returns 0 on success. */ @@ -219,6 +222,7 @@ static int v9fs_launder_page(struct page *page) /** * v9fs_direct_IO - 9P address space operation for direct I/O * @iocb: target I/O control block + * @iter: The data/buffer to use * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index aab5e6538660..246235ebdb70 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -359,14 +359,11 @@ out_err: } /** - * v9fs_file_read - read from a file - * @filp: file pointer to read - * @udata: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ - static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -388,11 +385,9 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t @@ -561,11 +556,9 @@ out_unlock: } /** - * v9fs_mmap_file_read - read from a file - * @filp: file pointer to read - * @data: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_mmap_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ static ssize_t @@ -576,11 +569,9 @@ v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_mmap_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_mmap_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 795706520b5e..08f48b70a741 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -218,7 +218,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat) /** * v9fs_alloc_inode - helper function to allocate an inode - * + * @sb: The superblock to allocate the inode from */ struct inode *v9fs_alloc_inode(struct super_block *sb) { @@ -238,7 +238,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) /** * v9fs_free_inode - destroy an inode - * + * @inode: The inode to be freed */ void v9fs_free_inode(struct inode *inode) @@ -343,7 +343,7 @@ error: * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with - * + * @rdev: The device numbers to set */ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) @@ -369,7 
+369,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) } /** - * v9fs_clear_inode - release an inode + * v9fs_evict_inode - Remove an inode from the inode cache * @inode: inode to release * */ @@ -665,14 +665,15 @@ error: /** * v9fs_vfs_create - VFS hook to create a regular file + * @mnt_userns: The user namespace of the mount + * @dir: The parent directory + * @dentry: The name of file to be created + * @mode: The UNIX file mode to set + * @excl: True if the file must not yet exist * * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called * for mknod(2). * - * @dir: directory inode that is being created - * @dentry: dentry that is being deleted - * @mode: create permissions - * */ static int @@ -696,6 +697,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory @@ -900,10 +902,12 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode + * @mnt_userns: The user namespace of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode * @new_dentry: new dentry + * @flags: RENAME_* flags * */ @@ -1009,6 +1013,7 @@ done: /** * v9fs_vfs_getattr - retrieve file metadata + * @mnt_userns: The user namespace of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1050,6 +1055,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /** * v9fs_vfs_setattr - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -1285,6 +1291,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks + * @mnt_userns: The user namespace of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1340,6 +1347,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e1c0240b51c0..01b9e1281a29 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -37,7 +37,10 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** - * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object + * @dir_inode: The directory inode + * + * Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ @@ -211,12 +214,13 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
+ * @mnt_userns: The user namespace of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions + * @excl: True if the file must not yet exist * */ - static int v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) @@ -361,6 +365,7 @@ err_clunk_old_fid: /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory @@ -537,6 +542,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -816,6 +822,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation diff --git a/fs/affs/super.c b/fs/affs/super.c index c6c2a513ec92..c609005a9eaa 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * blocks, we will have to change it. */ - size = i_size_read(sb->s_bdev->bd_inode) >> 9; + size = bdev_nr_sectors(sb->s_bdev); pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index dae9a57d7ec0..45cfd50a9521 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -86,8 +86,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode return afs_do_sync_operation(op); } -/** - * afs_sillyrename - Perform a silly-rename of a dentry +/* + * Perform silly-rename of a dentry. * * AFS is stateless and the server doesn't know when the client is holding a * file open. To prevent application problems when a file is unlinked while diff --git a/fs/afs/write.c b/fs/afs/write.c index 2dfe3b3a53d6..8b1d9c2f6bec 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = thp_head(vmf->page); + struct folio *folio = page_folio(vmf->page); + struct page *page = &folio->page; struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); @@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) goto out; #endif - if (wait_on_page_writeback_killable(page)) + if (folio_wait_writeback_killable(folio)) goto out; if (lock_page_killable(page) < 0) @@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. 
*/ - if (wait_on_page_writeback_killable(page) < 0) { - unlock_page(page); + if (folio_wait_writeback_killable(folio) < 0) { + folio_unlock(folio); goto out; } @@ -974,8 +975,7 @@ int afs_launder_page(struct page *page) iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); - ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, - true); + ret = afs_store_data(vnode, &iter, page_offset(page) + f, true); } trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); @@ -1417,7 +1417,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb) spin_unlock_irqrestore(&ctx->ctx_lock, flags); } -static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +static void aio_complete_rw(struct kiocb *kiocb, long res) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); @@ -1437,7 +1437,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) } iocb->ki_res.res = res; - iocb->ki_res.res2 = res2; + iocb->ki_res.res2 = 0; iocb_put(iocb); } @@ -1508,7 +1508,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; fallthrough; default: - req->ki_complete(req, ret, 0); + req->ki_complete(req, ret); } } diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 16b5fca0626e..54c1f8b8b075 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -358,7 +358,7 @@ int autofs_wait(struct autofs_sb_info *sbi, qstr.len = strlen(p); offset = p - name; } - qstr.hash = full_name_hash(dentry, name, qstr.len); + qstr.hash = full_name_hash(dentry, qstr.name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a3b830b8410a..444e9c89ff3e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/list_sort.h> #include "misc.h" #include "ctree.h" #include "block-group.h" @@ -144,6 +145,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) */ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); + kfree(cache->physical_map); kfree(cache); } } @@ -902,6 +904,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); path = btrfs_alloc_path(); if (!path) { @@ -1484,6 +1487,21 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +/* + * We want block groups with a low number of used bytes to be in the beginning + * of the list, so they will get reclaimed first. + */ +static int reclaim_bgs_cmp(void *unused, const struct list_head *a, + const struct list_head *b) +{ + const struct btrfs_block_group *bg1, *bg2; + + bg1 = list_entry(a, struct btrfs_block_group, bg_list); + bg2 = list_entry(b, struct btrfs_block_group, bg_list); + + return bg1->used > bg2->used; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&fs_info->unused_bgs_lock); + /* + * Sort happens under lock because we can't simply splice it and sort. + * The block groups might still be in use and reachable via bg_list, + * and their presence in the reclaim_bgs list must be preserved. 
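The block-group hunks above add reclaim_bgs_cmp() and sort fs_info->reclaim_bgs with list_sort() while holding unused_bgs_lock. A sketch of the comparator contract follows; the demo_* names are made up, and the only assumption is list_sort()'s documented rule that a positive return means "a sorts after b":

```c
#include <linux/list.h>
#include <linux/list_sort.h>
#include <linux/types.h>

struct demo_bg {			/* stand-in for struct btrfs_block_group */
	u64 used;
	struct list_head bg_list;
};

/* list_sort() treats a return value > 0 as "a sorts after b". */
static int demo_bg_cmp(void *unused, const struct list_head *a,
		       const struct list_head *b)
{
	const struct demo_bg *bg1 = list_entry(a, struct demo_bg, bg_list);
	const struct demo_bg *bg2 = list_entry(b, struct demo_bg, bg_list);

	return bg1->used > bg2->used;	/* ascending by used bytes */
}

/*
 * Caller, with the list protected by the appropriate lock:
 *	list_sort(NULL, &reclaim_list, demo_bg_cmp);
 * The least-used block groups end up at the head and are reclaimed first.
 */
```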
+ */ + list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; int ret = 0; @@ -1895,6 +1919,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); + INIT_LIST_HEAD(&cache->active_bg_list); btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); @@ -2035,6 +2060,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, */ if (btrfs_is_zoned(info)) { btrfs_calc_zone_unusable(cache); + /* Should not have any excluded extents. Just in case, though. */ + btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; @@ -2062,15 +2089,18 @@ static int read_one_block_group(struct btrfs_fs_info *info, link_block_group(cache); set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->start)) { + if (btrfs_chunk_writeable(info, cache->start)) { + if (cache->used == 0) { + ASSERT(list_empty(&cache->bg_list)); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&info->discard_ctl, cache); + else + btrfs_mark_bg_unused(cache); + } + } else { inc_block_group_ro(cache, 1); - } else if (cache->used == 0) { - ASSERT(list_empty(&cache->bg_list)); - if (btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_discard_queue_work(&info->discard_ctl, cache); - else - btrfs_mark_bg_unused(cache); } + return 0; error: btrfs_put_block_group(cache); @@ -2438,6 +2468,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } + /* + * New block group is likely to be used soon. Try to activate it now. + * Failure is OK for now. 
+ */ + btrfs_zone_activate(cache); + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2479,7 +2515,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, 0, &cache->space_info); + cache->bytes_super, cache->zone_unusable, + &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2594,7 +2631,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) if (!--cache->ro) { if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ - cache->zone_unusable = cache->alloc_offset - cache->used; + cache->zone_unusable = + (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); sinfo->bytes_zone_unusable += cache->zone_unusable; sinfo->bytes_readonly -= cache->zone_unusable; } @@ -3143,7 +3182,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) } int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc) + u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_block_group *cache = NULL; @@ -3380,36 +3419,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) */ check_system_chunk(trans, flags); - bg = btrfs_alloc_chunk(trans, flags); + bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; } - /* - * If this is a system chunk allocation then stop right here and do not - * add the chunk item to the chunk btree. This is to prevent a deadlock - * because this system chunk allocation can be triggered while COWing - * some extent buffer of the chunk btree and while holding a lock on a - * parent extent buffer, in which case attempting to insert the chunk - * item (or update the device item) would result in a deadlock on that - * parent extent buffer. In this case defer the chunk btree updates to - * the second phase of chunk allocation and keep our reservation until - * the second phase completes. - * - * This is a rare case and can only be triggered by the very few cases - * we have where we need to touch the chunk btree outside chunk allocation - * and chunk removal. These cases are basically adding a device, removing - * a device or resizing a device. - */ - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - return 0; - ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); /* * Normally we are not expected to fail with -ENOSPC here, since we have * previously reserved space in the system space_info and allocated one - * new system chunk if necessary. However there are two exceptions: + * new system chunk if necessary. However there are three exceptions: * * 1) We may have enough free space in the system space_info but all the * existing system block groups have a profile which can not be used @@ -3435,13 +3455,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) * with enough free space got turned into RO mode by a running scrub, * and in this case we have to allocate a new one and retry. We only * need do this allocate and retry once, since we have a transaction - * handle and scrub uses the commit root to search for block groups. 
+ * handle and scrub uses the commit root to search for block groups; + * + * 3) We had one system block group with enough free space when we called + * check_system_chunk(), but after that, right before we tried to + * allocate the last extent buffer we needed, a discard operation came + * in and it temporarily removed the last free space entry from the + * block group (discard removes a free space entry, discards it, and + * then adds back the entry to the block group cache). */ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3519,7 +3546,15 @@ out: * properly, either intentionally or as a bug. One example where this is * done intentionally is fsync, as it does not reserve any transaction units * and ends up allocating a variable number of metadata extents for log - * tree extent buffers. + * tree extent buffers; + * + * 4) The task has reserved enough transaction units / metadata space, but right + * before it tries to allocate the last extent buffer it needs, a discard + * operation comes in and, temporarily, removes the last free space entry from + * the only metadata block group that had free space (discard starts by + * removing a free space entry from a block group, then does the discard + * operation and, once it's done, it adds back the free space entry to the + * block group). * * We also need this 2 phases setup when adding a device to a filesystem with * a seed device - we must create new metadata and system chunks without adding @@ -3537,14 +3572,14 @@ out: * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of * the system chunk array due to concurrent allocations") provides more details. * - * For allocation of system chunks, we defer the updates and insertions into the - * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because - * if the chunk allocation is triggered while COWing an extent buffer of the - * chunk btree, we are holding a lock on the parent of that extent buffer and - * doing the chunk btree updates and insertions can require locking that parent. - * This is for the very few and rare cases where we update the chunk btree that - * are not chunk allocation or chunk removal: adding a device, removing a device - * or resizing a device. + * Allocation of system chunks does not happen through this function. A task that + * needs to update the chunk btree (the only btree that uses system chunks), must + * preallocate chunk space by calling either check_system_chunk() or + * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or + * metadata chunk or when removing a chunk, while the later is used before doing + * a modification to the chunk btree - use cases for the later are adding, + * removing and resizing a device as well as relocation of a system chunk. + * See the comment below for more details. * * The reservation of system space, done through check_system_chunk(), as well * as all the updates and insertions into the chunk btree must be done while @@ -3581,11 +3616,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (trans->allocating_chunk) return -ENOSPC; /* - * If we are removing a chunk, don't re-enter or we would deadlock. 
- * System space reservation and system chunk allocation is done by the - * chunk remove operation (btrfs_remove_chunk()). + * Allocation of system chunks can not happen through this path, as we + * could end up in a deadlock if we are allocating a data or metadata + * chunk and there is another task modifying the chunk btree. + * + * This is because while we are holding the chunk mutex, we will attempt + * to add the new chunk item to the chunk btree or update an existing + * device item in the chunk btree, while the other task that is modifying + * the chunk btree is attempting to COW an extent buffer while holding a + * lock on it and on its parent - if the COW operation triggers a system + * chunk allocation, then we can deadlock because we are holding the + * chunk mutex and we may need to access that extent buffer or its parent + * in order to add the chunk item or update a device item. + * + * Tasks that want to modify the chunk tree should reserve system space + * before updating the chunk btree, by calling either + * btrfs_reserve_chunk_metadata() or check_system_chunk(). + * It's possible that after a task reserves the space, it still ends up + * here - this happens in the cases described above at do_chunk_alloc(). + * The task will have to either retry or fail. */ - if (trans->removing_chunk) + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; space_info = btrfs_find_space_info(fs_info, flags); @@ -3684,17 +3735,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) return num_dev; } -/* - * Reserve space in the system space for allocating or removing a chunk - */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +static void reserve_chunk_space(struct btrfs_trans_handle *trans, + u64 bytes, + u64 type) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; - u64 thresh; int ret = 0; - u64 num_devs; /* * Needed because we can end up allocating a system chunk and for an @@ -3707,19 +3755,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); - num_devs = get_profile_num_devs(fs_info, type); - - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_metadata_size(fs_info, num_devs) + - btrfs_calc_insert_metadata_size(fs_info, 1); - - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); + left, bytes, type); btrfs_dump_space_info(fs_info, info, 0, 0); } - if (left < thresh) { + if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; @@ -3728,21 +3770,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * needing it, as we might not need to COW all nodes/leafs from * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). - * - * Also, if our caller is allocating a system chunk, do not - * attempt to insert the chunk item in the chunk btree, as we - * could deadlock on an extent buffer since our caller may be - * COWing an extent buffer from the chunk btree. 
*/ - bg = btrfs_alloc_chunk(trans, flags); + bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); - } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + } else { /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore - * any error here. + * any error here. An ENOSPC here could happen, due to + * the cases described at do_chunk_alloc() - the system + * block group we just created was just turned into RO + * mode by a scrub for example, or a running discard + * temporarily removed its free space entries, etc. */ btrfs_chunk_alloc_add_chunk_item(trans, bg); } @@ -3751,12 +3792,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (!ret) { ret = btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); + bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) - trans->chunk_bytes_reserved += thresh; + trans->chunk_bytes_reserved += bytes; } } +/* + * Reserve space in the system space for allocating or removing a chunk. + * The caller must be holding fs_info->chunk_mutex. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 num_devs = get_profile_num_devs(fs_info, type); + u64 bytes; + + /* num_devs device items to update and 1 chunk item to add or remove. */ + bytes = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + reserve_chunk_space(trans, bytes, type); +} + +/* + * Reserve space in the system space, if needed, for doing a modification to the + * chunk btree. + * + * @trans: A transaction handle. + * @is_item_insertion: Indicate if the modification is for inserting a new item + * in the chunk btree or if it's for the deletion or update + * of an existing item. + * + * This is used in a context where we need to update the chunk btree outside + * block group allocation and removal, to avoid a deadlock with a concurrent + * task that is allocating a metadata or data block group and therefore needs to + * update the chunk btree while holding the chunk mutex. After the update to the + * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. 
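The kernel-doc above spells out the calling convention for btrfs_reserve_chunk_metadata(). A hypothetical caller sketch, assuming only what that comment states; demo_update_chunk_btree() and demo_insert_chunk_item() are made-up names, and btrfs_trans_release_chunk_metadata() is the release helper the comment refers to:

```c
/* Hypothetical caller: modify the chunk btree outside chunk allocation/removal. */
static int demo_update_chunk_btree(struct btrfs_trans_handle *trans)
{
	int ret;

	/* One new item will be inserted into the chunk btree. */
	btrfs_reserve_chunk_metadata(trans, true);

	ret = demo_insert_chunk_item(trans);	/* made-up helper */

	btrfs_trans_release_chunk_metadata(trans);
	return ret;
}
```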
+ * + */ +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 bytes; + + if (is_item_insertion) + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + else + bytes = btrfs_calc_metadata_size(fs_info, 1); + + mutex_lock(&fs_info->chunk_mutex); + reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); + mutex_unlock(&fs_info->chunk_mutex); +} + void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; @@ -3833,6 +3923,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->zone_active_bgs_lock); + while (!list_empty(&info->zone_active_bgs)) { + block_group = list_first_entry(&info->zone_active_bgs, + struct btrfs_block_group, + active_bg_list); + list_del_init(&block_group->active_bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->zone_active_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c72a71efcb18..5878b7ce3b78 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -98,6 +98,7 @@ struct btrfs_block_group { unsigned int to_copy:1; unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; + unsigned int zone_is_active:1; int disk_cache_state; @@ -202,7 +203,10 @@ struct btrfs_block_group { */ u64 alloc_offset; u64 zone_unusable; + u64 zone_capacity; u64 meta_write_pointer; + struct map_lookup *physical_map; + struct list_head active_bg_list; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc); + u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, @@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 76ee1452c57b..ab2a4a52e0bb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,17 +138,34 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* total number of bytes pending delalloc, used by stat to calc the - * real block usage of the file - */ - u64 delalloc_bytes; - - /* - * Total number of bytes pending delalloc that fall within a file - * range that is either a hole or beyond EOF (and no prealloc extent - * exists in the range). This is always <= delalloc_bytes. 
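The btrfs_inode hunk in this region overlays file-only delalloc counters with directory-only logging offsets. A standalone user-space sketch of the same space-saving idea, with made-up names: anonymous unions let two mutually exclusive roles share one u64 slot.

```c
#include <stdint.h>
#include <stdio.h>

struct demo_inode {
	union {
		uint64_t delalloc_bytes;	/* meaningful for regular files only */
		uint64_t last_dir_item_offset;	/* meaningful for directories only */
	};
	union {
		uint64_t new_delalloc_bytes;	/* files only */
		uint64_t last_dir_index_offset;	/* directories only */
	};
};

int main(void)
{
	/* Two u64 slots back four logically distinct fields. */
	printf("sizeof(struct demo_inode) = %zu\n", sizeof(struct demo_inode));
	return 0;
}
```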
- */ - u64 new_delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The offset of the last dir item key that was logged. + * This is used only for directories. + */ + u64 last_dir_item_offset; + }; + + union { + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes and this + * is used only for files. + */ + u64 new_delalloc_bytes; + /* + * The offset of the last dir index key that was logged. + * This is used only for directories. + */ + u64 last_dir_index_offset; + }; /* * total number of bytes pending defrag, used by stat to check whether @@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) struct btrfs_dio_private { struct inode *inode; - u64 logical_offset; + + /* + * Since DIO can use anonymous page, we cannot use page_offset() to + * grab the file offset, thus need a dedicated member for file offset. + */ + u64 file_offset; u64 disk_bytenr; /* Used for bio::bi_size */ u32 bytes; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 86816088927f..7e9f90fa0388 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -186,7 +186,6 @@ struct btrfsic_dev_state { struct list_head collision_resolving_node; /* list node */ struct btrfsic_block dummy_block_for_bio_bh_flush; u64 last_flush_gen; - char name[BDEVNAME_SIZE]; }; struct btrfsic_block_hashtable { @@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; ds->bdev = NULL; ds->state = NULL; - ds->name[0] = '\0'; INIT_LIST_HEAD(&ds->collision_resolving_node); ds->last_flush_gen = 0; btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); @@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", + "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, - dev_state->name, dev_bytenr, + dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); @@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame: if (disk_item_offset + sizeof(struct btrfs_item) > sf->block_ctx->len) { leaf_item_out_of_bounce_error: - pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + pr_info( + "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", sf->block_ctx->start, - sf->block_ctx->dev->name); + sf->block_ctx->dev->bdev); goto one_stack_frame_backwards; } btrfsic_read_from_block_data(sf->block_ctx, @@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame: (uintptr_t)nodehdr; if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > sf->block_ctx->len) { - pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n", + pr_info( + "btrfsic: node item out of bounce at logical %llu, dev %pg\n", sf->block_ctx->start, - sf->block_ctx->dev->name); + sf->block_ctx->dev->bdev); goto one_stack_frame_backwards; } btrfsic_read_from_block_data( @@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block( if (next_block->logical_bytenr != 
next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", - next_bytenr, next_block_ctx->dev->name, + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + next_bytenr, next_block_ctx->dev->bdev, next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block), next_block->logical_bytenr); else - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", - next_bytenr, next_block_ctx->dev->name, + pr_info( + "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", + next_bytenr, next_block_ctx->dev->bdev, next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block)); @@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data( if (file_extent_item_offset + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); return -1; } @@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data( if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); return -1; } btrfsic_read_from_block_data(block_ctx, &file_extent_item, @@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data( next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) { - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n", + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", next_bytenr, - next_block_ctx.dev->name, + next_block_ctx.dev->bdev, next_block_ctx.dev_bytenr, mirror_num, next_block->logical_bytenr); @@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; - struct btrfs_bio *multi = NULL; + struct btrfs_io_context *multi = NULL; struct btrfs_device *device; length = len; @@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(num_pages - i); + bio = btrfs_bio_alloc(num_pages - i); bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio->bi_opf = REQ_OP_READ; @@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %s!\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: read error at logical %llu dev %pg!\n", + block_ctx->start, block_ctx->dev->bdev); bio_put(bio); return -1; } @@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state) list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { const struct btrfsic_block_link *l; - pr_info("%c-block @%llu (%s/%llu/%d)\n", + pr_info("%c-block @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + 
b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num); list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n", + pr_info( + " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n", + pr_info( + " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); } @@ -1743,16 +1748,18 @@ again: if (block->logical_bytenr != bytenr && !(!block->is_metadata && block->logical_bytenr == 0)) - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", - bytenr, dev_state->name, + pr_info( +"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block), block->logical_bytenr); else - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", - bytenr, dev_state->name, + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); @@ -1767,8 +1774,9 @@ again: processed_len = state->datablock_size; bytenr = block->logical_bytenr; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", - bytenr, dev_state->name, dev_bytenr, + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } @@ -1778,9 +1786,10 @@ again: list_empty(&block->ref_to_list) ? ' ' : '!', list_empty(&block->ref_from_list) ? 
' ' : '!'); if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", btrfsic_get_block_type(state, block), bytenr, - dev_state->name, dev_bytenr, block->mirror_num, + dev_state->bdev, dev_bytenr, block->mirror_num, block->generation, btrfs_disk_key_objectid(&block->disk_key), block->disk_key.type, @@ -1792,9 +1801,10 @@ again: } if (!block->is_iodone && !block->never_written) { - pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", btrfsic_get_block_type(state, block), bytenr, - dev_state->name, dev_bytenr, block->mirror_num, + dev_state->bdev, dev_bytenr, block->mirror_num, block->generation, btrfs_stack_header_generation( (struct btrfs_header *) @@ -1921,8 +1931,9 @@ again: if (!is_metadata) { processed_len = state->datablock_size; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block (%s/%llu/?) !found in hash table, D.\n", - dev_state->name, dev_bytenr); + pr_info( + "written block (%pg/%llu/?) !found in hash table, D\n", + dev_state->bdev, dev_bytenr); if (!state->include_extent_data) { /* ignore that written D block */ goto continue_loop; @@ -1939,8 +1950,9 @@ again: btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n", - bytenr, dev_state->name, dev_bytenr); + pr_info( + "written block @%llu (%pg/%llu/?) !found in hash table, M\n", + bytenr, dev_state->bdev, dev_bytenr); } block_ctx.dev = dev_state; @@ -1995,9 +2007,9 @@ again: block->next_in_same_bio = NULL; } if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New written %c-block @%llu (%s/%llu/%d)\n", + pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", is_metadata ? 
'M' : 'D', - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", bp->bi_status, btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->name, + block->logical_bytenr, dev_state->bdev, block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; @@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp) dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %s flush_gen=%llu\n", - dev_state->name, + pr_info("bio_end_io() new %pg flush_gen=%llu\n", + dev_state->bdev, dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) @@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock( if (!(superblock->generation > state->max_superblock_generation || 0 == state->max_superblock_generation)) { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n", + pr_info( + "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", superblock->logical_bytenr, - superblock->dev_state->name, + superblock->dev_state->bdev, superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), state->max_superblock_generation); } else { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n", + pr_info( + "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", superblock->logical_bytenr, - superblock->dev_state->name, + superblock->dev_state->bdev, superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), state->max_superblock_generation); @@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, */ list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n", + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); if (l->block_ref_to->never_written) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if 
(!l->block_ref_to->is_iodone) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->block_ref_to->iodone_w_error) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; @@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, l->parent_generation && BTRFSIC_GENERATION_UNKNOWN != l->block_ref_to->generation) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, l->block_ref_to->generation, @@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, ret = -1; } else if (l->block_ref_to->flush_gen > l->block_ref_to->dev_state->last_flush_gen) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, block->flush_gen, l->block_ref_to->dev_state->last_flush_gen); @@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock( */ list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n", + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); if (l->block_ref_from->is_superblock && @@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock( static void btrfsic_print_add_link(const struct btrfsic_state *state, const struct btrfsic_block_link *l) { 
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n", + pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } static void btrfsic_print_rem_link(const struct btrfsic_state *state, const struct btrfsic_block_link *l) { - pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n", + pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)", + indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num); if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { printk("[...]\n"); @@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( block->never_written = never_written; block->mirror_num = mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%s/%llu/%d)\n", + pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", additional_string, btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->name, + block->logical_bytenr, dev_state->bdev, block->dev_bytenr, mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, } if (WARN_ON(!match)) { - pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n", - bytenr, dev_state->name, dev_bytenr); + pr_info( +"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", + bytenr, dev_state->bdev, dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { ret = btrfsic_map_block(state, bytenr, state->metablock_size, @@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, if (ret) continue; - pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n", - bytenr, block_ctx.dev->name, + pr_info("read logical bytenr @%llu maps to 
(%pg/%llu/%d)\n", + bytenr, block_ctx.dev->bdev, block_ctx.dev_bytenr, mirror_num); } } @@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio) if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->name); + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); } else { struct btrfsic_block *const block = &dev_state->dummy_block_for_bio_bh_flush; @@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, list_for_each_entry(device, dev_head, dev_list) { struct btrfsic_dev_state *ds; - const char *p; if (!device->bdev || !device->name) continue; @@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, } ds->bdev = device->bdev; ds->state = state; - bdevname(ds->bdev, ds->name); - ds->name[BDEVNAME_SIZE - 1] = '\0'; - p = kbasename(ds->name); - strlcpy(ds->name, p, sizeof(ds->name)); btrfsic_dev_state_hashtable_add(ds, &btrfsic_dev_state_hashtable); } @@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) if (b_all->is_iodone || b_all->never_written) btrfsic_block_free(b_all); else - pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n", + pr_info( +"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7869ad12bc6e..32da97c3c19d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> @@ -28,6 +29,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "subpage.h" #include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -172,16 +174,17 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = page_address(page); + kaddr = kmap_atomic(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); + kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); - if (btrfs_io_bio(bio)->device) + if (btrfs_bio(bio)->device) btrfs_dev_stat_inc_and_print( - btrfs_io_bio(bio)->device, + btrfs_bio(bio)->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); return -EIO; } @@ -192,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, return 0; } +/* + * Reduce bio and io accounting for a compressed_bio with its corresponding bio. + * + * Return true if there is no pending bio nor io. + * Return false otherwise. + */ +static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + unsigned int bi_size = 0; + bool last_io = false; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* + * At endio time, bi_iter.bi_size doesn't represent the real bio size. 
+ * Thus here we have to iterate through all segments to grab correct + * bio size. + */ + bio_for_each_segment_all(bvec, bio, iter_all) + bi_size += bvec->bv_len; + + if (bio->bi_status) + cb->errors = 1; + + ASSERT(bi_size && bi_size <= cb->compressed_len); + last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, + &cb->pending_sectors); + /* + * Here we must wake up the possible error handler after all other + * operations on @cb finished, or we can race with + * finish_compressed_bio_*() which may free @cb. + */ + wake_up_var(cb); + + return last_io; +} + +static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +{ + unsigned int index; + struct page *page; + + /* Release the compressed pages */ + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + put_page(page); + } + + /* Do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(bio); + ASSERT(!bio->bi_status); + /* + * We have verified the checksum already, set page checked so + * the end_io handlers know about it + */ + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { + u64 bvec_start = page_offset(bvec->bv_page) + + bvec->bv_offset; + + btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), + bvec->bv_page, bvec_start, + bvec->bv_len); + } + + bio_endio(cb->orig_bio); + } + + /* Finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +} + /* when we finish reading compressed pages from the disk, we * decompress them and then run the bio end_io routines on the * decompressed pages (in the inode address space). @@ -206,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; - struct page *page; - unsigned int index; - unsigned int mirror = btrfs_io_bio(bio)->mirror_num; + unsigned int mirror = btrfs_bio(bio)->mirror_num; int ret = 0; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) + if (!dec_and_test_compressed_bio(cb, bio)) goto out; /* * Record the correct mirror_num in cb->orig_bio so that * read-repair can work properly. 
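dec_and_test_compressed_bio() above tracks outstanding work in sector units: each completing bio subtracts the sectors it covered, and only the caller that drops the count to zero runs the final cleanup. A user-space sketch of the same pattern using C11 atomics; the kernel code uses refcount_sub_and_test(), and the names and sizes below are illustrative:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint pending_sectors;

/* Returns true for exactly one caller: the one completing the last sectors. */
static bool complete_range(unsigned int sectors)
{
	/* atomic_fetch_sub() returns the value before the subtraction. */
	return atomic_fetch_sub(&pending_sectors, sectors) == sectors;
}

int main(void)
{
	atomic_store(&pending_sectors, 8);	/* e.g. a 32K extent with 4K sectors */

	printf("%d\n", complete_range(3));	/* 0: work still pending */
	printf("%d\n", complete_range(5));	/* 1: last completer, do the cleanup */
	return 0;
}
```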
*/ - btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + btrfs_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; /* @@ -248,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) cb->errors = 1; - - /* release the compressed pages */ - index = 0; - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - put_page(page); - } - - /* do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * we have verified the checksum already, set page - * checked so the end_io handlers know about it - */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) - SetPageChecked(bvec->bv_page); - - bio_endio(cb->orig_bio); - } - - /* finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); + finish_compressed_bio_read(cb, bio); out: bio_put(bio); } @@ -289,6 +336,7 @@ out: static noinline void end_compressed_writeback(struct inode *inode, const struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; @@ -311,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode, for (i = 0; i < ret; i++) { if (cb->errors) SetPageError(pages[i]); - end_page_writeback(pages[i]); + btrfs_page_clamp_clear_writeback(fs_info, pages[i], + cb->start, cb->len); put_page(pages[i]); } nr_pages -= ret; @@ -320,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode, /* the inode may be gone now */ } -/* - * do the cleanup once all the compressed pages hit the disk. - * This will clear writeback on the file pages and free the compressed - * pages. - * - * This also calls the writeback end hooks for the file pages so that - * metadata and checksums can be updated in the file. - */ -static void end_compressed_bio_write(struct bio *bio) +static void finish_compressed_bio_write(struct compressed_bio *cb) { - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - struct page *page; + struct inode *inode = cb->inode; unsigned int index; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) - goto out; - - /* ok, we're the last bio for this extent, step one is to - * call back into the FS and do all the end_io operations + /* + * Ok, we're the last bio for this extent, step one is to call back + * into the FS and do all the end_io operations. 
*/ - inode = cb->inode; - btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, !cb->errors); end_compressed_writeback(inode, cb); - /* note, our inode could be gone now */ + /* Note, our inode could be gone now */ /* - * release the compressed pages, these came from alloc_page and + * Release the compressed pages, these came from alloc_page and * are not attached to the inode at all */ - index = 0; for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; + struct page *page = cb->compressed_pages[index]; + page->mapping = NULL; put_page(page); } - /* finally free the cb struct */ + /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); +} + +/* + * Do the cleanup once all the compressed pages hit the disk. This will clear + * writeback on the file pages and free the compressed pages. + * + * This also calls the writeback end hooks for the file pages so that metadata + * and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio) +{ + struct compressed_bio *cb = bio->bi_private; + + if (!dec_and_test_compressed_bio(cb, bio)) + goto out; + + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + + finish_compressed_bio_write(cb); out: bio_put(bio); } +static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, + struct compressed_bio *cb, + struct bio *bio, int mirror_num) +{ + blk_status_t ret; + + ASSERT(bio->bi_iter.bi_size); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) + return ret; + ret = btrfs_map_bio(fs_info, bio, mirror_num); + return ret; +} + +/* + * Allocate a compressed_bio, which will be used to read/write on-disk + * (aka, compressed) * data. + * + * @cb: The compressed_bio structure, which records all the needed + * information to bind the compressed data to the uncompressed + * page cache. + * @disk_byten: The logical bytenr where the compressed data will be read + * from or written to. + * @endio_func: The endio function to call after the IO for compressed data + * is finished. + * @next_stripe_start: Return value of logical bytenr of where next stripe starts. + * Let the caller know to only fill the bio up to the stripe + * boundary. + */ + + +static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, + unsigned int opf, bio_end_io_t endio_func, + u64 *next_stripe_start) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + struct btrfs_io_geometry geom; + struct extent_map *em; + struct bio *bio; + int ret; + + bio = btrfs_bio_alloc(BIO_MAX_VECS); + + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bio->bi_opf = opf; + bio->bi_private = cb; + bio->bi_end_io = endio_func; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); + if (IS_ERR(em)) { + bio_put(bio); + return ERR_CAST(em); + } + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); + + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); + free_extent_map(em); + if (ret < 0) { + bio_put(bio); + return ERR_PTR(ret); + } + *next_stripe_start = disk_bytenr + geom.len; + + return bio; +} + /* * worker function to build and submit bios for previously compressed pages. 
* The corresponding pages in the inode should be marked for writeback @@ -394,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; - unsigned long bytes_left; - int pg_index = 0; - struct page *page; - u64 first_byte = disk_start; + u64 cur_disk_bytenr = disk_start; + u64 next_stripe_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; - WARN_ON(!PAGE_ALIGNED(start)); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = &inode->vfs_inode; cb->start = start; @@ -418,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - - if (use_append) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); - if (IS_ERR(device)) { - kfree(cb); - bio_put(bio); - return BLK_STS_NOTSUPP; + while (cur_disk_bytenr < disk_start + compressed_len) { + u64 offset = cur_disk_bytenr - disk_start; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!bio) { + bio = alloc_compressed_bio(cb, cur_disk_bytenr, + bio_op | write_flags, end_compressed_bio_write, + &next_stripe_start); + if (IS_ERR(bio)) { + ret = errno_to_blk_status(PTR_ERR(bio)); + bio = NULL; + goto finish_cb; + } } - - bio_set_dev(bio, device->bdev); - } - - if (blkcg_css) { - bio->bi_opf |= REQ_CGROUP_PUNT; - kthread_associate_blkcg(blkcg_css); - } - refcount_set(&cb->pending_bios, 1); - - /* create and submit bios for the compressed pages */ - bytes_left = compressed_len; - for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { - int submit = 0; - int len = 0; - - page = compressed_pages[pg_index]; - page->mapping = inode->vfs_inode.i_mapping; - if (bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, - 0); - /* - * Page can only be added to bio if the current bio fits in - * stripe. + * We should never reach next_stripe_start start as we will + * submit comp_bio when reach the boundary immediately. */ - if (!submit) { - if (pg_index == 0 && use_append) - len = bio_add_zone_append_page(bio, page, - PAGE_SIZE, 0); - else - len = bio_add_page(bio, page, PAGE_SIZE, 0); - } - - page->mapping = NULL; - if (submit || len < PAGE_SIZE) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. 
Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ + ASSERT(cur_disk_bytenr != next_stripe_start); + /* + * We have various limits on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + + if (use_append) + added = bio_add_zone_append_page(bio, page, real_size, + offset_in_page(offset)); + else + added = bio_add_page(bio, page, real_size, + offset_in_page(offset)); + /* Reached zoned boundary */ + if (added == 0) + submit = true; + + cur_disk_bytenr += added; + /* Reached stripe boundary */ + if (cur_disk_bytenr == next_stripe_start) + submit = true; + + /* Finished the range */ + if (cur_disk_bytenr == disk_start + compressed_len) + submit = true; + + if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); + if (ret) + goto finish_cb; } - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - /* - * Use bio_add_page() to ensure the bio has at least one - * page. - */ - bio_add_page(bio, page, PAGE_SIZE, 0); + ret = submit_compressed_bio(fs_info, cb, bio, 0); + if (ret) + goto finish_cb; + bio = NULL; } - if (bytes_left < PAGE_SIZE) { - btrfs_info(fs_info, - "bytes left %lu compress len %u nr %u", - bytes_left, cb->compressed_len, cb->nr_pages); - } - bytes_left -= PAGE_SIZE; - first_byte += PAGE_SIZE; cond_resched(); } + if (blkcg_css) + kthread_associate_blkcg(NULL); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } + return 0; - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { +finish_cb: + if (bio) { bio->bi_status = ret; bio_endio(bio); } + /* Last byte of @cb is submitted, endio will free @cb */ + if (cur_disk_bytenr == disk_start + compressed_len) + return ret; - if (blkcg_css) - kthread_associate_blkcg(NULL); - - return 0; + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_start + compressed_len - cur_disk_bytenr) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish manually. + */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_write(cb); + return ret; } static u64 bio_end_offset(struct bio *bio) @@ -539,25 +636,33 @@ static u64 bio_end_offset(struct bio *bio) return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } +/* + * Add extra pages in the same compressed file extent so that we don't need to + * re-read the same extent again and again. + * + * NOTE: this won't work well for subpage, as for subpage read, we lock the + * full page then submit bio for each compressed/regular extents. 
+ * + * This means, if we have several sectors in the same page points to the same + * on-disk compressed data, we will re-read the same extent many times and + * this function can only help for the next page. + */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; - unsigned long pg_index; - u64 last_offset; + u64 cur = bio_end_offset(cb->orig_bio); u64 isize = i_size_read(inode); int ret; struct page *page; - unsigned long nr_pages = 0; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; struct extent_io_tree *tree; - u64 end; - int misses = 0; + int sectors_missed = 0; - last_offset = bio_end_offset(cb->orig_bio); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; @@ -576,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_SHIFT; + while (cur < compressed_end) { + u64 page_end; + u64 pg_index = cur >> PAGE_SHIFT; + u32 add_size; if (pg_index > end_index) break; page = xa_load(&mapping->i_pages, pg_index); if (page && !xa_is_value(page)) { - misses++; - if (misses > 4) + sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + fs_info->sectorsize_bits; + + /* Beyond threshold, no need to continue */ + if (sectors_missed > 4) break; - goto next; + + /* + * Jump to next page start as we already have page for + * current offset. + */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } page = __page_cache_alloc(mapping_gfp_constraint(mapping, @@ -597,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { put_page(page); - goto next; + /* There is already a page, skip to page end */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } - /* - * at this point, we have a locked page in the page cache - * for these bytes in the file. But, we have to make - * sure they map to this compressed extent on disk. - */ ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -612,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; } - end = last_offset + PAGE_SIZE - 1; - lock_extent(tree, last_offset, end); + page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + lock_extent(tree, cur, page_end); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, last_offset, - PAGE_SIZE); + em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); - if (!em || last_offset < em->start || - (last_offset + PAGE_SIZE > extent_map_end(em)) || + /* + * At this point, we have a locked page in the page cache for + * these bytes in the file. But, we have to make sure they map + * to this compressed extent on disk. 
+ */ + if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end); + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; @@ -641,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode, } } - ret = bio_add_page(cb->orig_bio, page, - PAGE_SIZE, 0); - - if (ret == PAGE_SIZE) { - nr_pages++; - put_page(page); - } else { - unlock_extent(tree, last_offset, end); + add_size = min(em->start + em->len, page_end + 1) - cur; + ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; } -next: - last_offset += PAGE_SIZE; + /* + * If it's subpage, we also need to increase its + * subpage::readers number, as at endio we will decrease + * subpage::readers and to unlock the page. + */ + if (fs_info->sectorsize < PAGE_SIZE) + btrfs_subpage_start_reader(fs_info, page, cur, add_size); + put_page(page); + cur += add_size; } return 0; } @@ -679,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned int compressed_len; unsigned int nr_pages; unsigned int pg_index; - struct page *page; - struct bio *comp_bio; - u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; + struct bio *comp_bio = NULL; + const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_byte = disk_bytenr; + u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -708,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb) goto out; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -748,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - refcount_set(&cb->pending_bios, 1); - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - u32 pg_len = PAGE_SIZE; - int submit = 0; + while (cur_disk_byte < disk_bytenr + compressed_len) { + u64 offset = cur_disk_byte - disk_bytenr; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = cb->compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!comp_bio) { + comp_bio = alloc_compressed_bio(cb, cur_disk_byte, + REQ_OP_READ, end_compressed_bio_read, + &next_stripe_start); + if (IS_ERR(comp_bio)) { + ret = errno_to_blk_status(PTR_ERR(comp_bio)); + comp_bio = NULL; + goto finish_cb; + } + } + /* + * We should never reach next_stripe_start start as we will + * submit comp_bio when reach the boundary immediately. 
+ */ + ASSERT(cur_disk_byte != next_stripe_start); + /* + * We have various limit on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); /* - * To handle subpage case, we need to make sure the bio only - * covers the range we need. - * - * If we're at the last page, truncate the length to only cover - * the remaining part. + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. */ - if (pg_index == nr_pages - 1) - pg_len = min_t(u32, PAGE_SIZE, - compressed_len - pg_index * PAGE_SIZE); + ASSERT(added == real_size); + cur_disk_byte += added; - page = cb->compressed_pages[pg_index]; - page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_SHIFT; + /* Reached stripe boundary, need to submit */ + if (cur_disk_byte == next_stripe_start) + submit = true; - if (comp_bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, pg_len, - comp_bio, 0); + /* Has finished the range, need to submit */ + if (cur_disk_byte == disk_bytenr + compressed_len) + submit = true; - page->mapping = NULL; - if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) { + if (submit) { unsigned int nr_sectors; - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto finish_cb; nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - - bio_add_page(comp_bio, page, pg_len, 0); + ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + if (ret) + goto finish_cb; + comp_bio = NULL; } - cur_disk_byte += pg_len; } - - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - return 0; fail2: @@ -842,6 +951,26 @@ fail1: out: free_extent_map(em); return ret; +finish_cb: + if (comp_bio) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + } + /* All bytes of @cb is submitted, endio will free @cb */ + if (cur_disk_byte == disk_bytenr + compressed_len) + return ret; + + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_bytenr + compressed_len - cur_disk_byte) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish @cb manually. 
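The submission error paths above wait for cb->pending_sectors to drop back to the number of sectors that were never submitted. Below is a minimal sketch of the completion-side accounting this implies; the helper name and signature are hypothetical (the patch's dec_and_test_compressed_bio() body is not shown in this hunk), and it only assumes that each finished bio subtracts the sectors it covered and wakes any waiter.

	/* Hypothetical helper, assumed semantics only. */
	static bool compressed_bio_sectors_done(struct compressed_bio *cb,
						u32 nr_sectors, bool failed)
	{
		if (failed)
			cb->errors = 1;

		/*
		 * pending_sectors starts at compressed_len >> sectorsize_bits;
		 * each completed bio drops the sectors it carried.
		 */
		if (refcount_sub_and_test(nr_sectors, &cb->pending_sectors))
			return true;	/* last sector done, caller finishes @cb */

		/* Let wait_var_event() waiters in the error paths re-check. */
		wake_up_var(cb);
		return false;
	}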
+ */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_read(cb, NULL); + return ret; } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 399be0b435bf..56eef0821e3e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -28,8 +28,8 @@ struct btrfs_inode; #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* number of bios pending for this compressed extent */ - refcount_t pending_bios; + /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ + refcount_t pending_sectors; /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84627cbd5b5b..c3983bdaf4b8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/mm.h> +#include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@ -2487,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, int ret; BUG_ON(!path->nodes[level]); - btrfs_assert_tree_locked(path->nodes[level]); + btrfs_assert_tree_write_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@ -2827,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); /* @@ -3065,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* @@ -3581,40 +3582,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, } /* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct btrfs_key *new_key) -{ - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, 1); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; -} - -/* * make the item pointed to by the path smaller. 
new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@ -3785,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items - * @cpu_key: array of keys for items to be inserted - * @data_size: size of the body of each item we are going to insert - * @nr: size of @cpu_key/@data_size arrays + * @batch: information about the batch of items to insert */ -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) +static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -3803,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; u32 total_size; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - total_size = total_data + (nr * sizeof(struct btrfs_item)); + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. + */ if (path->slots[0] == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); fixup_low_keys(path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -3820,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); @@ -3849,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(&token, item); btrfs_set_token_item_offset(&token, item, - ioff - total_data); + ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), btrfs_item_nr_offset(slot), (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - total_data, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + data_end - batch->total_data_size, + BTRFS_LEAF_DATA_OFFSET + data_end, + old_data - data_end); data_end = old_data; } /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - data_end -= data_size[i]; + data_end -= batch->data_sizes[i]; btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, data_size[i]); + btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3883,26 +3849,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } /* + * Insert a new item into 
a leaf. + * + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. + * @data_size: The size of the data associated with the new key. + */ +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(root, path, &batch); +} + +/* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { int ret = 0; int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; + u32 total_size; - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); if (ret == 0) return -EEXIST; if (ret < 0) @@ -3911,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, nr); + setup_items_for_insert(root, path, batch); return 0; } @@ -3943,6 +3926,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * delete the pointer from a given node. 
* * the tree should have been previously balanced so the deletion does not diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dff2c8a3e059..7553e9dc5f93 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; +struct btrfs_bio; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -217,6 +218,9 @@ struct btrfs_root_backup { u8 unused_8[10]; } __attribute__ ((__packed__)); +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -269,7 +273,11 @@ struct btrfs_super_block { __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; + + /* Padded to 4096 bytes */ + u8 padding[565]; } __attribute__ ((__packed__)); +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* * Compat flags that we support. If any incompat flags are set other than the @@ -899,6 +907,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -1017,6 +1026,16 @@ struct btrfs_fs_info { spinlock_t treelog_bg_lock; u64 treelog_bg; + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); +/* + * Describes a batch of items to insert in a btree. This is used by + * btrfs_insert_empty_items(). + */ +struct btrfs_item_batch { + /* + * Pointer to an array containing the keys of the items to insert (in + * sorted order). + */ + const struct btrfs_key *keys; + /* Pointer to an array containing the data size for each item to insert. */ + const u32 *data_sizes; + /* + * The sum of data sizes for all items. The caller can compute this while + * setting up the data_sizes array, so it ends up being more efficient + * than having btrfs_insert_empty_items() or setup_item_for_insert() + * doing it, as it would avoid an extra loop over a potentially large + * array, and in the case of setup_item_for_insert(), we would be doing + * it while holding a write lock on a leaf and often on upper level nodes + * too, unnecessarily increasing the size of a critical section. + */ + u32 total_data_size; + /* Size of the keys and data_sizes arrays (number of items in the batch). 
*/ + int nr; +}; + +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); + const struct btrfs_item_batch *batch); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, const struct btrfs_key *key, u32 data_size) { - return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + return btrfs_insert_empty_items(trans, root, path, &batch); } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); @@ -3030,7 +3082,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, @@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, @@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); -int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file *file, +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_pages); + u64 newer_than, unsigned long max_to_defrag); void 
btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, @@ -3563,6 +3613,9 @@ do { \ (errno), fmt, ##args); \ } while (0) +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) + __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, @@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; } +static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) +{ + return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; +} + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1e08eb2b27f0..e164766dcc38 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { - LIST_HEAD(batch); + LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + struct btrfs_item_batch batch; int total_size; - int nitems; char *ins_data = NULL; - struct btrfs_key *ins_keys; - u32 *ins_sizes; int ret; - list_add_tail(&first_item->tree_list, &batch); - nitems = 1; + list_add_tail(&first_item->tree_list, &item_list); + batch.total_data_size = first_item->data_len; + batch.nr = 1; total_size = first_item->data_len + sizeof(struct btrfs_item); curr = first_item; @@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, if (total_size + next_size > max_size) break; - list_add_tail(&next->tree_list, &batch); - nitems++; + list_add_tail(&next->tree_list, &item_list); + batch.nr++; total_size += next_size; + batch.total_data_size += next->data_len; curr = next; } - if (nitems == 1) { - ins_keys = &first_item->key; - ins_sizes = &first_item->data_len; + if (batch.nr == 1) { + batch.keys = &first_item->key; + batch.data_sizes = &first_item->data_len; } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; int i = 0; - ins_data = kmalloc(nitems * sizeof(u32) + - nitems * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc(batch.nr * sizeof(u32) + + batch.nr * sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; } ins_sizes = (u32 *)ins_data; - ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32)); - list_for_each_entry(curr, &batch, tree_list) { + ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + list_for_each_entry(curr, &item_list, tree_list) { ins_keys[i] = curr->key; ins_sizes[i] = curr->data_len; i++; } } - ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes, - nitems); + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) goto out; - list_for_each_entry(curr, &batch, tree_list) { + list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); @@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); - list_for_each_entry_safe(curr, next, &batch, tree_list) { + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); 
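For illustration, a hypothetical caller of the new batch interface: the function below is made up, but the struct btrfs_item_batch fields and the btrfs_insert_empty_items() signature are the ones introduced above, with the caller computing total_data_size up front exactly as the delayed-inode code does.

	static int insert_two_items_example(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path,
					    const struct btrfs_key keys[2],
					    const u32 sizes[2])
	{
		struct btrfs_item_batch batch = {
			.keys = keys,
			.data_sizes = sizes,
			/* Sum computed once here, outside any leaf lock. */
			.total_data_size = sizes[0] + sizes[1],
			.nr = 2,
		};

		return btrfs_insert_empty_items(trans, root, path, &batch);
	}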
btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca848b183474..cca7e85e32dd 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 parent = generic_ref->parent; u8 ref_type; - is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID); + is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); BUG_ON(extent_op && extent_op->is_data); @@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(generic_ref->real_root) && - is_fstree(generic_ref->tree_ref.root) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { @@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.root, action, ref_type); - ref->root = generic_ref->tree_ref.root; + generic_ref->tree_ref.owning_root, action, + ref_type); + ref->root = generic_ref->tree_ref.owning_root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.root, 0, action, false, - is_system); + generic_ref->tree_ref.owning_root, 0, action, + false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.ref_root; + u64 ref_root = generic_ref->data_ref.owning_root; u64 owner = generic_ref->data_ref.ino; u64 offset = generic_ref->data_ref.offset; u8 ref_type; @@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root) && - is_fstree(generic_ref->real_root) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e22fba272e4f..91a3aabad150 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -186,8 +186,8 @@ enum btrfs_ref_type { struct btrfs_data_ref { /* For EXTENT_DATA_REF */ - /* Root which refers to this data extent */ - u64 ref_root; + /* Original root this data extent belongs to */ + u64 owning_root; /* Inode which refers to this data extent */ u64 ino; @@ -210,11 +210,11 @@ struct btrfs_tree_ref { int level; /* - * Root which refers to this tree block. + * Root which owns this tree block. * * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) */ - u64 root; + u64 owning_root; /* For non-skinny metadata, no special member needed */ }; @@ -231,17 +231,10 @@ struct btrfs_ref { */ bool skip_qgroup; - /* - * Optional. For which root is this modification. - * Mostly used for qgroup optimization. - * - * When unset, data/tree ref init code will populate it. - * In certain cases, we're modifying reference for a different root. - * E.g. COW fs tree blocks for balance. - * In that case, tree_ref::root will be fs tree, but we're doing this - * for reloc tree, then we should set @real_root to reloc tree. 
- */ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* Through which root is this modification. */ u64 real_root; +#endif u64 bytenr; u64 len; @@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, } static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, - int level, u64 root) + int level, u64 root, u64 mod_root, bool skip_qgroup) { +#ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - if (!generic_ref->real_root) - generic_ref->real_root = root; + generic_ref->real_root = mod_root ?: root; +#endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.root = root; + generic_ref->tree_ref.owning_root = root; generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + } static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset) + u64 ref_root, u64 ino, u64 offset, u64 mod_root, + bool skip_qgroup) { +#ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - if (!generic_ref->real_root) - generic_ref->real_root = ref_root; - generic_ref->data_ref.ref_root = ref_root; + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.owning_root = ref_root; generic_ref->data_ref.ino = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; } static inline struct btrfs_delayed_extent_op * diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d029be40ea6f..c85a7d44da79 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) { + struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; struct btrfs_key key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; @@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found: * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found: * We don't have an active replace item but if there is a * replace target, fail the mount. 
*/ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found: break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL); - dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, - NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); + args.devid = src_devid; + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); + /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@ -283,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { + if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f1274d5c3805..7721ce0c0604 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( } /* - * lookup a directory item based on name. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir item does not exists, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -273,27 +284,42 @@ out: } /* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory index item by name and index number. * - * The name is used to make sure the index really points to the name you were - * looking for. + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir index item does not exists, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). 
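A sketch of how a caller would consume the return convention documented above: the wrapper function is hypothetical and assumes the existing btrfs_delete_one_dir_name() helper; only the NULL / ERR_PTR() / valid-pointer cases come from the comment, with mod == -1 marking a deletion lookup.

	static int del_dir_index_example(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path,
					 u64 dir, u64 index,
					 const char *name, int name_len)
	{
		struct btrfs_dir_item *di;

		/* mod == -1: we intend to delete the item if it exists. */
		di = btrfs_lookup_dir_index_item(trans, root, path, dir, index,
						 name, name_len, -1);
		if (IS_ERR(di))
			return PTR_ERR(di);
		if (!di)
			return -ENOENT;	/* no matching dir index item */

		return btrfs_delete_one_dir_name(trans, root, path, di);
	}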
*/ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod) { + struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = objectid; + key.offset = index; - return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; } struct btrfs_dir_item * diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 355ea88d5c5f..59c3be8c1f4c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -683,7 +683,7 @@ err: return ret; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, +int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror) { @@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); return __set_page_dirty_nobuffers(page); } ASSERT(PagePrivate(page) && page->private); @@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page) ASSERT(eb); ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ASSERT(atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); @@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) struct btrfs_fs_info *fs_info = buf->fs_info; if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, @@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) goto fail; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + !btrfs_is_data_reloc_root(root)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1953,8 +1954,7 @@ sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); - if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, - &fs_info->fs_state))) + if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || @@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info, /* * For 4K page size, we only support 4K sector size. - * For 64K page size, we support read-write for 64K sector size, and - * read-only for 4K sector size. + * For 64K page size, we support 64K and 4K sector sizes. 
*/ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && @@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); + spin_lock_init(&fs_info->zone_active_bgs_lock); + spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); @@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); btrfs_init_btree_inode(fs_info); - invalidate_bdev(fs_devices->latest_bdev); + invalidate_bdev(fs_devices->latest_dev->bdev); /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { err = PTR_ERR(disk_super); goto fail_alloc; @@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - if (sectorsize != PAGE_SIZE) { + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - } - if (sectorsize != PAGE_SIZE) { if (btrfs_super_incompat_flags(fs_info->super_copy) & BTRFS_FEATURE_INCOMPAT_RAID56) { btrfs_err(fs_info, @@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device err = -EINVAL; goto fail_alloc; } + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); + if (!subpage_info) + goto fail_alloc; + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). 
*/ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_bdev) { + if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3740,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, else if (ret) return ERR_PTR(ret); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); @@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); - btrfs_advance_sb_log(device, i); + + if (btrfs_advance_sb_log(device, i)) + errors++; } return errors < i ? 0 : -1; } @@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); @@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || - test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); @@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 0e7e9526b6a8..a2b5db4ba262 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,9 +6,6 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 - #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, +int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc3da7585fb7..3fd736a02c1e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1266,7 +1266,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) { struct 
btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1313,22 +1313,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_bio *bbio = NULL; - + struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; int i; num_bytes = end - cur; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bbio, 0); + &num_bytes, &bioc, 0); /* * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or * -EOPNOTSUPP. For any such error, @num_bytes is not updated, @@ -1337,8 +1336,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, if (ret < 0) goto out; - stripe = bbio->stripes; - for (i = 0; i < bbio->num_stripes; i++, stripe++) { + stripe = bioc->stripes; + for (i = 0; i < bioc->num_stripes; i++, stripe++) { u64 bytes; struct btrfs_device *device = stripe->dev; @@ -1361,7 +1360,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, * And since there are two loops, explicitly * go to out to avoid confusion. */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); goto out; } @@ -1372,7 +1371,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, */ ret = 0; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); cur += num_bytes; } out: @@ -1397,7 +1396,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -2376,7 +2375,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, out: btrfs_free_path(path); - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; } @@ -2438,10 +2437,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset); - generic_ref.skip_qgroup = for_reloc; + key.offset, root->root_key.objectid, + for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -2453,9 +2451,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = fs_info->nodesize; btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); - generic_ref.skip_qgroup = for_reloc; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, + root->root_key.objectid, for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -3196,7 +3193,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_update_block_group(trans, bytenr, num_bytes, 
0); + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3289,7 +3286,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid); + root->root_key.objectid, 0, false); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { btrfs_ref_tree_mod(fs_info, &generic_ref); @@ -3373,9 +3370,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. */ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; @@ -3386,9 +3383,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -3476,7 +3473,9 @@ enum btrfs_extent_allocation_policy { */ struct find_free_extent_ctl { /* Basic allocation info */ + u64 ram_bytes; u64 num_bytes; + u64 min_alloc_size; u64 empty_size; u64 flags; int delalloc; @@ -3495,6 +3494,9 @@ struct find_free_extent_ctl { /* Allocation is called for tree-log */ bool for_treelog; + /* Allocation is called for data relocation */ + bool for_data_reloc; + /* RAID index, converted from flags */ int index; @@ -3756,8 +3758,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, u64 avail; u64 bytenr = block_group->start; u64 log_bytenr; + u64 data_reloc_bytenr; int ret = 0; - bool skip; + bool skip = false; ASSERT(btrfs_is_zoned(block_group->fs_info)); @@ -3767,19 +3770,49 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, */ spin_lock(&fs_info->treelog_bg_lock); log_bytenr = fs_info->treelog_bg; - skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || - (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr))) + skip = true; spin_unlock(&fs_info->treelog_bg_lock); if (skip) return 1; + /* + * Do not allow non-relocation blocks in the dedicated relocation block + * group, and vice versa. 
+ */ + spin_lock(&fs_info->relocation_bg_lock); + data_reloc_bytenr = fs_info->data_reloc_bg; + if (data_reloc_bytenr && + ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || + (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) + skip = true; + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); + if (block_group->ro || + block_group->alloc_offset == block_group->zone_capacity) { + spin_unlock(&block_group->lock); + return 1; + } + spin_unlock(&block_group->lock); + + if (!btrfs_zone_activate(block_group)) + return 1; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock); ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); + ASSERT(!ffe_ctl->for_data_reloc || + block_group->start == fs_info->data_reloc_bg || + fs_info->data_reloc_bg == 0); if (block_group->ro) { ret = 1; @@ -3796,7 +3829,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, goto out; } - avail = block_group->length - block_group->alloc_offset; + /* + * Do not allow currently used block group to be the data relocation + * dedicated block group. + */ + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); + avail = block_group->zone_capacity - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { /* @@ -3813,6 +3857,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); @@ -3829,6 +3876,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; + if (ret && ffe_ctl->for_data_reloc) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); @@ -3932,18 +3982,30 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; - if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && - ffe_ctl->have_caching_bg) - return 1; - - if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) - return 1; - if (ins->objectid) { found_extent(ffe_ctl, ins); return 0; } + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && + !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) { + /* + * If we have enough free space left in an already active block + * group and we can't activate any other zone now, retry the + * active ones with a smaller allocation size. Returning early + * from here will tell btrfs_reserve_extent() to halve the + * size.
+ */ + return -ENOSPC; + } + + if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + return 1; + + ffe_ctl->index++; + if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) + return 1; + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along @@ -4085,6 +4147,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); } + if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } return 0; default: BUG(); @@ -4117,65 +4185,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_root *root, - u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte_orig, struct btrfs_key *ins, - u64 flags, int delalloc) + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; - struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool full_search = false; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - WARN_ON(num_bytes < fs_info->sectorsize); - - ffe_ctl.num_bytes = num_bytes; - ffe_ctl.empty_size = empty_size; - ffe_ctl.flags = flags; - ffe_ctl.search_start = 0; - ffe_ctl.delalloc = delalloc; - ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); - ffe_ctl.have_caching_bg = false; - ffe_ctl.orig_have_caching_bg = false; - ffe_ctl.found_offset = 0; - ffe_ctl.hint_byte = hint_byte_orig; - ffe_ctl.for_treelog = for_treelog; - ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); + ffe_ctl->search_start = 0; /* For clustered allocation */ - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - ffe_ctl.last_ptr = NULL; - ffe_ctl.use_cluster = true; + ffe_ctl->empty_cluster = 0; + ffe_ctl->last_ptr = NULL; + ffe_ctl->use_cluster = true; + ffe_ctl->have_caching_bg = false; + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; + /* For clustered allocation */ + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; if (btrfs_is_zoned(fs_info)) - ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, num_bytes, empty_size, flags); + trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, + ffe_ctl->flags); - space_info = btrfs_find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", flags); + btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); return -ENOSPC; } - ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); if (ret < 0) return ret; - ffe_ctl.search_start = max(ffe_ctl.search_start, - first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, 
ffe_ctl.hint_byte); - if (ffe_ctl.search_start == ffe_ctl.hint_byte) { + ffe_ctl->search_start = max(ffe_ctl->search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); + if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, - ffe_ctl.search_start); + ffe_ctl->search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -4183,7 +4248,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, flags) && + if (block_group && block_group_bits(block_group, ffe_ctl->flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4197,9 +4262,10 @@ static noinline int find_free_extent(struct btrfs_root *root, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - ffe_ctl.index = btrfs_bg_flags_to_raid_index( - block_group->flags); - btrfs_lock_block_group(block_group, delalloc); + ffe_ctl->index = btrfs_bg_flags_to_raid_index( + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); goto have_block_group; } } else if (block_group) { @@ -4207,31 +4273,33 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - ffe_ctl.have_caching_bg = false; - if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || - ffe_ctl.index == 0) + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, - &space_info->block_groups[ffe_ctl.index], list) { + &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { - if (for_treelog) + if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); continue; } - btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->start; + btrfs_grab_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->search_start = block_group->start; /* * this can happen if we end up cycling through all the * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, flags)) { + if (!block_group_bits(block_group, ffe_ctl->flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | @@ -4242,7 +4310,7 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((flags & extra) && !(block_group->flags & extra)) + if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) goto loop; /* @@ -4250,14 +4318,14 @@ search: * It's possible that we have MIXED_GROUP flag but no * block group is mixed. Just skip such block group. 
*/ - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); continue; } have_block_group: - ffe_ctl.cached = btrfs_block_group_done(block_group); - if (unlikely(!ffe_ctl.cached)) { - ffe_ctl.have_caching_bg = true; + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); /* @@ -4280,10 +4348,11 @@ have_block_group: goto loop; bg_ret = NULL; - ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, + ffe_ctl->delalloc); block_group = bg_ret; } } else if (ret == -EAGAIN) { @@ -4293,46 +4362,49 @@ have_block_group: } /* Checks */ - ffe_ctl.search_start = round_up(ffe_ctl.found_offset, - fs_info->stripesize); + ffe_ctl->search_start = round_up(ffe_ctl->found_offset, + fs_info->stripesize); /* move on to the next group */ - if (ffe_ctl.search_start + num_bytes > + if (ffe_ctl->search_start + ffe_ctl->num_bytes > block_group->start + block_group->length) { btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, num_bytes); + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } - if (ffe_ctl.found_offset < ffe_ctl.search_start) + if (ffe_ctl->found_offset < ffe_ctl->search_start) btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + ffe_ctl->found_offset, + ffe_ctl->search_start - ffe_ctl->found_offset); - ret = btrfs_add_reserved_bytes(block_group, ram_bytes, - num_bytes, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, + ffe_ctl->delalloc); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, num_bytes); + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = ffe_ctl.search_start; - ins->offset = num_bytes; + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, - num_bytes); - btrfs_release_block_group(block_group, delalloc); + trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, + ffe_ctl->num_bytes); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: - release_block_group(block_group, &ffe_ctl, delalloc); + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); if (ret > 0) goto search; @@ -4341,12 +4413,12 @@ loop: * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. 
*/ - if (!ffe_ctl.max_extent_size) - ffe_ctl.max_extent_size = ffe_ctl.total_free_space; + if (!ffe_ctl->max_extent_size) + ffe_ctl->max_extent_size = ffe_ctl->total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = ffe_ctl.max_extent_size; + space_info->max_extent_size = ffe_ctl->max_extent_size; spin_unlock(&space_info->lock); - ins->offset = ffe_ctl.max_extent_size; + ins->offset = ffe_ctl->max_extent_size; } else if (ret == -ENOSPC) { ret = cache_block_group_error; } @@ -4404,16 +4476,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; + struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, - hint_byte, ins, flags, delalloc); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.min_alloc_size = min_alloc_size; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.delalloc = delalloc; + ffe_ctl.hint_byte = hint_byte; + ffe_ctl.for_treelog = for_treelog; + ffe_ctl.for_data_reloc = for_data_reloc; + + ret = find_free_extent(root, ins, &ffe_ctl); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { @@ -4431,8 +4515,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu tree-log %d", - flags, num_bytes, for_treelog); + "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", + flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); @@ -4543,7 +4627,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); + ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -4632,7 +4716,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, 1); + fs_info->nodesize, true); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", extent_key.objectid, extent_key.offset); @@ -4655,7 +4739,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -4847,8 +4932,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins.objectid, ins.offset, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level, root_objectid); + 
btrfs_init_tree_ref(&generic_ref, level, root_objectid, + root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) @@ -4859,6 +4944,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); @@ -5264,7 +5350,8 @@ skip: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; @@ -5749,13 +5836,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; } - btrfs_assert_tree_locked(parent); + btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - btrfs_assert_tree_locked(node); + btrfs_assert_tree_write_locked(node); level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aaddd7225348..4e03a6d3aa32 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -241,7 +241,7 @@ int __init extent_io_init(void) return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), + offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) goto free_buffer_cache; @@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode, /* * Find and lock a contiguous range of bytes in the file marked as delalloc, no - * more than @max_bytes. @Start and @end are used to return the range, + * more than @max_bytes. * - * Return: true if we find something - * false if nothing was in the tree + * @start: The original start bytenr to search. + * Will store the extent range start bytenr. + * @end: The original end bytenr of the search range + * Will store the extent range end bytenr. + * + * Return true if we find a delalloc range which starts inside the original + * range, and @start/@end will store the delalloc range start/end. + * + * Return false if we can't find any delalloc range which starts inside the + * original range, and @start/@end will be the non-delalloc range start/end. 
*/ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, @@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, u64 *end) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, int ret; int loops = 0; + /* Caller should pass a valid @end to indicate the search range end */ + ASSERT(orig_end > orig_start); + + /* The range should at least cover part of the page */ + ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || + orig_end <= page_offset(locked_page))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); - if (!found || delalloc_end <= *start) { + if (!found || delalloc_end <= *start || delalloc_start > orig_end) { *start = delalloc_start; - *end = delalloc_end; + + /* @delalloc_end can be -1, never go beyond @orig_end */ + *end = min(delalloc_end, orig_end); free_extent_state(cached_state); return false; } @@ -2282,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree, * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); @@ -2299,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_is_zoned(fs_info)) return btrfs_repair_one_zone(fs_info, logical); - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation. */ @@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, * stripe's dev and sector. 
*/ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bbio, 0); + &map_length, &bioc, 0); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - ASSERT(bbio->mirror_num == 1); + ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); + &map_length, &bioc, mirror_num); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - BUG_ON(mirror_num != bbio->mirror_num); + BUG_ON(mirror_num != bioc->mirror_num); } - sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[bbio->mirror_num - 1].dev; - btrfs_put_bbio(bbio); + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { btrfs_bio_counter_dec(fs_info); @@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); + struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; - struct btrfs_io_bio *repair_io_bio; + struct btrfs_bio *repair_bbio; blk_status_t status; btrfs_debug(fs_info, @@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode, return -EIO; } - repair_bio = btrfs_io_bio_alloc(1); - repair_io_bio = btrfs_io_bio(repair_bio); + repair_bio = btrfs_bio_alloc(1); + repair_bbio = btrfs_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; - if (failed_io_bio->csum) { + if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; - repair_io_bio->csum = repair_io_bio->csum_inline; - memcpy(repair_io_bio->csum, - failed_io_bio->csum + csum_size * icsum, csum_size); + repair_bbio->csum = repair_bbio->csum_inline; + memcpy(repair_bbio->csum, + failed_bbio->csum + csum_size * icsum, csum_size); } bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_io_bio->logical = failrec->start; - repair_io_bio->iter = repair_bio->bi_iter; + repair_bbio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), "repair read error: submitting new read to mirror %d", @@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* @@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, - io_bio->mirror_num); + bbio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = io_bio->mirror_num; + mirror = bbio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) { - error_bitmap = 
btrfs_verify_data_csum(io_bio, + error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); ret = error_bitmap; } else { - ret = btrfs_validate_metadata_buffer(io_bio, + ret = btrfs_validate_metadata_buffer(bbio, page, start, end, mirror); } if (ret) @@ -3106,7 +3123,7 @@ readpage_ok: } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_io_bio_free_csum(io_bio); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -3115,53 +3132,43 @@ readpage_ok: * new bio by bio_alloc_bioset as it does not initialize the bytes outside of * 'bio' because use of __GFP_ZERO is not supported. */ -static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) +static inline void btrfs_bio_init(struct btrfs_bio *bbio) { - memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); } /* - * The following helpers allocate a bio. As it's backed by a bioset, it'll - * never fail. We're returning a bio right now but you can call btrfs_io_bio - * for the appropriate container_of magic + * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. + * + * The bio allocation is backed by bioset and does not fail. */ -struct bio *btrfs_bio_alloc(u64 first_byte) +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); - bio->bi_iter.bi_sector = first_byte >> 9; - btrfs_io_bio_init(btrfs_io_bio(bio)); + ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio)); return bio; } struct bio *btrfs_bio_clone(struct bio *bio) { - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); - btrfs_bio = btrfs_io_bio(new); - btrfs_io_bio_init(btrfs_bio); - btrfs_bio->iter = bio->bi_iter; + bbio = btrfs_bio(new); + btrfs_bio_init(bbio); + bbio->iter = bio->bi_iter; return new; } -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); - btrfs_io_bio_init(btrfs_io_bio(bio)); - return bio; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; ASSERT(offset <= UINT_MAX && size <= UINT_MAX); @@ -3169,11 +3176,11 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); - btrfs_bio = btrfs_io_bio(bio); - btrfs_io_bio_init(btrfs_bio); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio); bio_trim(bio, offset >> 9, size >> 9); - btrfs_bio->iter = bio->bi_iter; + bbio->iter = bio->bi_iter; return bio; } @@ -3307,14 +3314,15 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct bio *bio; int ret; + bio = btrfs_bio_alloc(BIO_MAX_VECS); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. 
*/ if (bio_flags & EXTENT_BIO_COMPRESSED) - bio = btrfs_bio_alloc(disk_bytenr); + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else - bio = btrfs_bio_alloc(disk_bytenr + offset); + bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; bio_ctrl->bio_flags = bio_flags; bio->bi_end_io = end_io_func; @@ -3327,7 +3335,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, if (wbc) { struct block_device *bdev; - bdev = fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_dev->bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); } @@ -3341,7 +3349,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, goto error; } - btrfs_io_bio(bio)->device = device; + btrfs_bio(bio)->device = device; } return 0; error: @@ -3599,6 +3607,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; + ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { struct extent_state *cached = NULL; @@ -3777,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc, */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - u64 delalloc_start, unsigned long *nr_written) + unsigned long *nr_written) { - u64 page_end = delalloc_start + PAGE_SIZE - 1; - bool found; + const u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; - u64 delalloc_end = 0; int ret; int page_started = 0; + while (delalloc_start < page_end) { + u64 delalloc_end = page_end; + bool found; - while (delalloc_end < page_end) { found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, &delalloc_end); @@ -3854,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ - unsigned long dirty_bitmap; unsigned long flags; - int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; - int range_start_bit = nbits; + int range_start_bit; int range_end_bit; /* @@ -3872,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, return; } + range_start_bit = spi->dirty_offset + + (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + /* We should have the page locked, but just in case */ spin_lock_irqsave(&subpage->lock, flags); - dirty_bitmap = subpage->dirty_bitmap; + bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, + spi->dirty_offset + spi->bitmap_nr_bits); spin_unlock_irqrestore(&subpage->lock, flags); - bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, - BTRFS_SUBPAGE_BITMAP_SIZE); + range_start_bit -= spi->dirty_offset; + range_end_bit -= spi->dirty_offset; + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; *end = page_offset(page) + range_end_bit * fs_info->sectorsize; } @@ -4054,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { struct inode *inode = page->mapping->host; - u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int 
nr = 0; size_t pg_offset; @@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!epd->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, - &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written); if (ret == 1) return 0; if (ret) @@ -4141,8 +4155,20 @@ done: * capable of that. */ if (PageError(page)) - end_extent_writepage(page, ret, start, page_end); - unlock_page(page); + end_extent_writepage(page, ret, page_start, page_end); + if (epd->extent_locked) { + /* + * If epd->extent_locked, it's from extent_write_locked_range(), + * the page can either be locked by lock_page() or + * process_one_page(). + * Let btrfs_page_unlock_writer() handle both cases. + */ + ASSERT(wbc); + btrfs_page_unlock_writer(fs_info, page, wbc->range_start, + wbc->range_end + 1 - wbc->range_start); + } else { + unlock_page(page); + } ASSERT(ret <= 0); return ret; } @@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { + if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) + btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4602,12 +4631,11 @@ static int submit_eb_subpage(struct page *page, int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; - const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; int ret; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < nbits) { + while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; struct extent_buffer *eb; unsigned long flags; @@ -4623,7 +4651,8 @@ static int submit_eb_subpage(struct page *page, break; } spin_lock_irqsave(&subpage->lock, flags); - if (!((1 << bit_start) & subpage->dirty_bitmap)) { + if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&page->mapping->private_lock); bit_start++; @@ -4756,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, free_extent_buffer(eb); return ret; } - if (cache) + if (cache) { + /* Implies write in zoned mode */ btrfs_put_block_group(cache); + /* Mark the last eb in a block group */ + if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) + set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4873,7 +4907,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. */ - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (!BTRFS_FS_ERROR(fs_info)) { ret = flush_write_bio(&epd); } else { ret = -EROFS; @@ -5069,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - int mode) +/* + * Submit the pages in the range to bio for call sites whose delalloc range has + * already been run (aka, ordered extent inserted) and all pages are still + * locked.
+ */ +int extent_write_locked_range(struct inode *inode, u64 start, u64 end) { + bool found_error = false; + int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long nr_pages = (end - start + PAGE_SIZE) >> - PAGE_SHIFT; - + u64 cur = start; + unsigned long nr_pages; + const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; struct extent_page_data epd = { .bio_ctrl = { 0 }, .extent_locked = 1, - .sync_io = mode == WB_SYNC_ALL, + .sync_io = 1, }; struct writeback_control wbc_writepages = { - .sync_mode = mode, - .nr_to_write = nr_pages * 2, + .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end + 1, /* We're called from an async helper function */ @@ -5093,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, .no_cgroup_owner = 1, }; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> + PAGE_SHIFT; + wbc_writepages.nr_to_write = nr_pages * 2; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); - while (start <= end) { - page = find_get_page(mapping, start >> PAGE_SHIFT); - if (clear_page_dirty_for_io(page)) - ret = __extent_writepage(page, &wbc_writepages, &epd); - else { - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), - page, start, start + PAGE_SIZE - 1, true); - unlock_page(page); + while (cur <= end) { + u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + + page = find_get_page(mapping, cur >> PAGE_SHIFT); + /* + * All pages in the range are locked since + * btrfs_run_delalloc_range(), thus there is no way to clear + * the page dirty flag. + */ + ASSERT(PageLocked(page)); + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + ret = __extent_writepage(page, &wbc_writepages, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + found_error = true; + first_error = ret; } put_page(page); - start += PAGE_SIZE; + cur = cur_end + 1; } - ASSERT(ret <= 0); - if (ret == 0) + if (!found_error) ret = flush_write_bio(&epd); else end_write_bio(&epd, ret); wbc_detach_inode(&wbc_writepages); + if (found_error) + return first_error; return ret; } int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct inode *inode = mapping->host; + const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root); + const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info); int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5127,7 +5184,15 @@ int extent_writepages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + /* + * Allow only a single thread to do the reloc work in zoned mode to + * protect the write pointer updates. + */ + if (data_reloc && zoned) + btrfs_inode_lock(inode, 0); ret = extent_write_cache_pages(mapping, wbc, &epd); + if (data_reloc && zoned) + btrfs_inode_unlock(inode, 0); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); @@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. 
*/ - ret = btrfs_alloc_subpage(fs_info, &prealloc, - BTRFS_SUBPAGE_METADATA); - if (ret < 0) { - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; + if (fs_info->sectorsize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + ret = PTR_ERR(prealloc); + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } } spin_lock(&mapping->private_lock); @@ -7167,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; u64 page_start = page_offset(page); - int ret; - int i; + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); - ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); lockdep_assert_held(&fs_info->buffer_lock); - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, - bytenr >> fs_info->sectorsize_bits, - PAGE_SIZE / fs_info->nodesize); - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - break; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - break; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } +out: return found; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 53abdc280451..0399cf8e3c32 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, + EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - int mode); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(u64 first_byte); -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); void 
end_extent_writepage(struct page *page, int err, u64 start, u64 end); int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a8e02f7b6c7..5a36add21305 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) int i; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; set_extent_bits_nowait(&device->alloc_state, stripe->physical, @@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) int i; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 0b9401a5afd3..d1cbb64a78f3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If * NULL, the checksum buffer is allocated and returned in - * btrfs_io_bio(bio)->csum instead. + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ @@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum = kmalloc_array(nblocks, csum_size, - GFP_NOFS); - if (!btrfs_bio->csum) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } } else { - btrfs_bio->csum = btrfs_bio->csum_inline; + bbio->csum = bbio->csum_inline; } - csum = btrfs_bio->csum; + csum = bbio->csum; } else { csum = dst; } @@ -709,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, index = 0; } - data = kmap_atomic(bvec.bv_page); - crypto_shash_digest(shash, data + bvec.bv_offset - + (i * fs_info->sectorsize), + data = bvec_kmap_local(&bvec); + crypto_shash_digest(shash, + data + (i * fs_info->sectorsize), fs_info->sectorsize, sums->sums + index); - kunmap_atomic(data); + kunmap_local(data); index += fs_info->csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d0f..9a3db1365ae8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * unlocks pages after btrfs_file_write is done with them */ -static void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, + struct page **pages, size_t num_pages, + u64 pos, u64 copied) { size_t i; + u64 block_start = round_down(pos, fs_info->sectorsize); + u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; + + ASSERT(block_len <= U32_MAX); for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages 
that * have been modified without going through btrfs_set_page_dirty @@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - ClearPageChecked(pages[i]); + btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, + block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct page *p = pages[i]; btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - ClearPageChecked(p); + btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } @@ -734,8 +741,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root == fs_info->tree_root); + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -870,7 +876,8 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - args->start - extent_offset); + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } @@ -956,7 +963,8 @@ delete_extent_item: btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, - key.offset - extent_offset); + key.offset - extent_offset, 0, + false); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ args->bytes_found += extent_end - key.offset; @@ -1021,8 +1029,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &args->extent_item_size, 1); + btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); args->extent_inserted = true; } @@ -1233,7 +1240,7 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, num_bytes, 0); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset); + orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1258,7 +1265,8 @@ again: other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, 0); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, + 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -1845,7 +1853,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); break; } @@ -1853,7 +1861,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); cond_resched(); @@ -2013,7 +2021,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. 
*/ - if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) + if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2621,7 +2629,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset); + btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2704,14 +2712,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, drop_args.bytes_found); if (ret != -ENOSPC) { /* - * When cloning we want to avoid transaction aborts when - * nothing was done and we are attempting to clone parts - * of inline extents, in such cases -EOPNOTSUPP is - * returned by __btrfs_drop_extents() without having - * changed anything in the file. + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * clone we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. */ - if (extent_info && !extent_info->is_new_extent && - ret && ret != -EOPNOTSUPP) + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da0eee7c9e5f..f3fee88c8ee0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,6 +22,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "subpage.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - ClearPageChecked(io_ctl->pages[i]); + btrfs_page_clear_checked(io_ctl->fs_info, + io_ctl->pages[i], + page_offset(io_ctl->pages[i]), + PAGE_SIZE); unlock_page(io_ctl->pages[i]); put_page(io_ctl->pages[i]); } @@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + bool initial = (size == block_group->length); + u64 reclaimable_unusable; + + WARN_ON(!initial && offset + size > block_group->zone_capacity); spin_lock(&ctl->tree_lock); if (!used) to_free = size; + else if (initial) + to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) to_free = size; else if (offset + size <= block_group->alloc_offset) @@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, spin_unlock(&block_group->lock); } + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); /* All the region is now unusable. 
Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && - block_group->zone_unusable >= - div_factor_fine(block_group->length, bg_reclaim_threshold)) { + reclaimable_unusable >= + div_factor_fine(block_group->zone_capacity, + bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, * out the free space after the allocation offset. */ if (btrfs_is_zoned(fs_info)) { - btrfs_info(fs_info, "free space %llu", - block_group->length - block_group->alloc_offset); + btrfs_info(fs_info, "free space %llu active %d", + block_group->zone_capacity - block_group->alloc_offset, + block_group->zone_is_active); return; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 487533c35ddb..b8c911a4a320 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,6 +6,7 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> +#include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = page_address(cpage); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -455,11 +457,10 @@ struct async_chunk { struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; - atomic_t *pending; + struct async_cow *async_cow; }; struct async_cow { - /* Number of chunks in flight; must be first in the structure */ atomic_t num_chunks; struct async_chunk chunks[]; }; @@ -490,9 +491,6 @@ static noinline int add_async_extent(struct async_chunk *cow, */ static inline bool inode_can_compress(struct btrfs_inode *inode) { - /* Subpage doesn't support compression yet */ - if (inode->root->fs_info->sectorsize < PAGE_SIZE) - return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; @@ -514,6 +512,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, btrfs_ino(inode)); return 0; } + /* + * Special check for subpage. + * + * We lock the full page then run each delalloc range in the page, thus + * for the following case, we will hit some subpage specific corner case: + * + * 0 32K 64K + * | |///////| |///////| + * \- A \- B + * + * In the above case, both range A and range B will try to unlock the full + * page [0, 64K), causing the one finished later to find the page + * unlocked already, triggering various page lock requirement BUG_ON()s. + * + * So here we add an artificial limit that subpage compression can only + * happen if the range is fully page aligned. + * + * In theory we only need to ensure the first page is fully covered, but + * the tailing partial page will be locked until the full compression + * finishes, delaying the write of other ranges. + * + * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges + * first to prevent any submitted async extent from unlocking the full page. + * By this, we can ensure for subpage case that only the last async_cow + * will unlock the full page.
+ */ + if (fs_info->sectorsize < PAGE_SIZE) { + if (!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(end + 1, PAGE_SIZE)) + return 0; + } + /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; @@ -615,13 +645,24 @@ again: total_compressed = actual_end - start; /* - * skip compression for a small file range(<=blocksize) that + * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) goto cleanup_and_bail_uncompressed; + /* + * For subpage case, we require full page alignment for the sector + * aligned range. + * Thus we must also check against @actual_end, not just @end. + */ + if (blocksize < PAGE_SIZE) { + if (!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + goto cleanup_and_bail_uncompressed; + } + total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -759,7 +800,7 @@ cont: * win, compare the page count read with the blocks on disk, * compression must free at least one sector size */ - total_in = ALIGN(total_in, PAGE_SIZE); + total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize <= total_in) { compressed_extents++; @@ -840,166 +881,148 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -/* - * phase two of compressed writeback. This is the ordered portion - * of the code, which only gets called in the order the work was - * queued. We walk all the async extents created by compress_file_range - * and send them down to the disk. - */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) +static int submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - struct btrfs_key ins; - struct extent_map *em; - struct btrfs_root *root = inode->root; - struct extent_io_tree *io_tree = &inode->io_tree; - int ret = 0; - -again: - while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - -retry: - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - /* did the compression code fall back to uncompressed IO? */ - if (!async_extent->pages) { - int page_started = 0; - unsigned long nr_written = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + unsigned long nr_written = 0; + int page_started = 0; + int ret; - /* allocate blocks */ - ret = cow_file_range(inode, async_chunk->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + /* + * Call cow_file_range() to run the delalloc range directly, since we + * won't go to NOCOW or async path again. + * + * Also we call cow_file_range() with @unlock_page == 0, so that we + * can directly submit them without interruption. 
+ */ + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, 0); + /* Inline extent inserted, page gets unlocked and everything is done */ + if (page_started) { + ret = 0; + goto out; + } + if (ret < 0) { + if (locked_page) + unlock_page(locked_page); + goto out; + } - /* JDM XXX */ + ret = extent_write_locked_range(&inode->vfs_inode, start, end); + /* All pages will be unlocked, including @locked_page */ +out: + kfree(async_extent); + return ret; +} - /* - * if page_started, cow_file_range inserted an - * inline extent and took care of all the unlocking - * and IO for us. Otherwise, we need to submit - * all those pages down to the drive. - */ - if (!page_started && !ret) - extent_write_locked_range(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - WB_SYNC_ALL); - else if (ret && async_chunk->locked_page) - unlock_page(async_chunk->locked_page); - kfree(async_extent); - cond_resched(); - continue; - } +static int submit_one_async_extent(struct btrfs_inode *inode, + struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) +{ + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key ins; + struct page *locked_page = NULL; + struct extent_map *em; + int ret = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; - ret = btrfs_reserve_extent(root, async_extent->ram_size, - async_extent->compressed_size, - async_extent->compressed_size, - 0, alloc_hint, &ins, 1, 1); - if (ret) { - free_async_extent_pages(async_extent); + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. + */ + if (async_chunk->locked_page) { + u64 locked_page_start = page_offset(async_chunk->locked_page); + u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; - if (ret == -ENOSPC) { - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (!(start >= locked_page_end || end <= locked_page_start)) + locked_page = async_chunk->locked_page; + } + lock_extent(io_tree, start, end); - /* - * we need to redirty the pages if we decide to - * fallback to uncompressed IO, otherwise we - * will not submit these pages down to lower - * layers. - */ - extent_range_redirty_for_io(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + /* We have fall back to uncompressed write */ + if (!async_extent->pages) + return submit_uncompressed_range(inode, async_extent, locked_page); - goto retry; - } - goto out_free; - } + ret = btrfs_reserve_extent(root, async_extent->ram_size, + async_extent->compressed_size, + async_extent->compressed_size, + 0, *alloc_hint, &ins, 1, 1); + if (ret) { + free_async_extent_pages(async_extent); /* - * here we're doing allocation and writeback of the - * compressed pages + * Here we used to try again by going back to non-compressed + * path for ENOSPC. But we can't reserve space even for + * compressed size, how could it work for uncompressed size + * which requires larger size? So here we directly go error + * path. 
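The argument above is a monotonicity one: the compressed size can never exceed the ram size, so a reservation that already failed for the smaller request cannot succeed for the larger fallback. A toy sketch with an invented free-space model, illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for the allocator: succeed only if the request fits. */
static bool demo_reserve(uint64_t free_bytes, uint64_t request)
{
    return request <= free_bytes;
}

int main(void)
{
    uint64_t free_bytes = 24 * 1024;       /* 24K left in the block group */
    uint64_t compressed_size = 32 * 1024;  /* what the compressed extent needs */
    uint64_t ram_size = 128 * 1024;        /* what an uncompressed retry would need */

    /* compressed_size <= ram_size always holds, so a failure here ... */
    printf("reserve compressed:   %d\n", demo_reserve(free_bytes, compressed_size));
    /* ... implies the old uncompressed retry could not have helped either. */
    printf("reserve uncompressed: %d\n", demo_reserve(free_bytes, ram_size));
    return 0;
}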
*/ - em = create_io_em(inode, async_extent->start, - async_extent->ram_size, /* len */ - async_extent->start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); - if (IS_ERR(em)) - /* ret value is not necessary due to void function */ - goto out_free_reserve; - free_extent_map(em); - - ret = btrfs_add_ordered_extent_compress(inode, - async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - async_extent->compress_type); - if (ret) { - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - goto out_free_reserve; - } - btrfs_dec_block_group_reservations(fs_info, ins.objectid); + goto out_free; + } + + /* Here we're doing allocation and writeback of the compressed pages */ + em = create_io_em(inode, start, + async_extent->ram_size, /* len */ + start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + async_extent->ram_size, /* ram_bytes */ + async_extent->compress_type, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserve; + } + free_extent_map(em); - /* - * clear dirty, set writeback and unlock the pages. - */ - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_START_WRITEBACK); - if (btrfs_submit_compressed_write(inode, async_extent->start, - async_extent->ram_size, - ins.objectid, - ins.offset, async_extent->pages, - async_extent->nr_pages, - async_chunk->write_flags, - async_chunk->blkcg_css)) { - struct page *p = async_extent->pages[0]; - const u64 start = async_extent->start; - const u64 end = start + async_extent->ram_size - 1; - - p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(inode, p, start, - end, false); - - p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, NULL, 0, - PAGE_END_WRITEBACK | - PAGE_SET_ERROR); - free_async_extent_pages(async_extent); - } - alloc_hint = ins.objectid + ins.offset; - kfree(async_extent); - cond_resched(); + ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ + ins.objectid, /* disk_bytenr */ + async_extent->ram_size, /* num_bytes */ + ins.offset, /* disk_num_bytes */ + async_extent->compress_type); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserve; } - return; + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + /* Clear dirty, set writeback and unlock the pages. 
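The annotated arguments above come down to one bookkeeping rule: a compressed ordered extent carries both the logical (ram) length and the usually smaller on-disk length. A hedged sketch of that relationship with made-up numbers; the demo_* struct is invented and far smaller than the real btrfs_ordered_extent:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_ordered_extent {
    uint64_t file_offset;     /* where the data lands in the file */
    uint64_t disk_bytenr;     /* where the compressed bytes live on disk */
    uint64_t num_bytes;       /* logical length (ram_size) */
    uint64_t disk_num_bytes;  /* compressed length actually written */
};

int main(void)
{
    struct demo_ordered_extent oe = {
        .file_offset    = 1 * 1024 * 1024,
        .disk_bytenr    = 256 * 1024 * 1024,
        .num_bytes      = 128 * 1024,   /* 128K of file data ... */
        .disk_num_bytes = 32 * 1024,    /* ... stored as 32K on disk */
    };

    /* Compression is only worthwhile when the on-disk copy is not larger. */
    assert(oe.disk_num_bytes <= oe.num_bytes);
    printf("saved %llu bytes on disk\n",
           (unsigned long long)(oe.num_bytes - oe.disk_num_bytes));
    return 0;
}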
*/ + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); + if (btrfs_submit_compressed_write(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* compressed_len */ + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, + async_chunk->write_flags, + async_chunk->blkcg_css)) { + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); + + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } + *alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + return ret; + out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@ -1007,7 +1030,39 @@ out_free: PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); - goto again; + return ret; +} + +/* + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. + */ +static noinline void submit_compressed_extents(struct async_chunk *async_chunk) +{ + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct async_extent *async_extent; + u64 alloc_hint = 0; + int ret = 0; + + while (!list_empty(&async_chunk->extents)) { + u64 extent_start; + u64 ram_size; + + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + extent_start = async_extent->start; + ram_size = async_extent->ram_size; + + ret = submit_one_async_extent(inode, async_chunk, async_extent, + &alloc_hint); + btrfs_debug(fs_info, +"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", + inode->root->root_key.objectid, + btrfs_ino(inode), extent_start, ram_size, ret); + } } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1150,7 +1205,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * fails during the stage where it updates the bytenr of file extent * items. 
*/ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; @@ -1186,8 +1241,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_drop_extent_cache; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* @@ -1325,18 +1379,17 @@ static noinline void async_cow_submit(struct btrfs_work *work) static noinline void async_cow_free(struct btrfs_work *work) { struct async_chunk *async_chunk; + struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); - /* - * Since the pointer to 'pending' is at the beginning of the array of - * async_chunk's, freeing it ensures the whole array has been freed. - */ - if (atomic_dec_and_test(async_chunk->pending)) - kvfree(async_chunk->pending); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); } static int cow_file_range_async(struct btrfs_inode *inode, @@ -1397,7 +1450,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); - async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].async_cow = ctx; async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; @@ -1470,7 +1523,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, __set_page_dirty_nobuffers(locked_page); account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1503,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(inode); - const bool is_reloc_ino = (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID); + const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; @@ -1866,8 +1918,7 @@ out_check: btrfs_dec_nocow_writers(fs_info, disk_bytenr); nocow = false; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler @@ -1946,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page int ret; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + /* + * The range must cover part of the @locked_page, or the returned + * @page_started can confuse the caller. + */ + ASSERT(!(end <= page_offset(locked_page) || + start >= page_offset(locked_page) + PAGE_SIZE)); + if (should_nocow(inode, start, end)) { - ASSERT(!zoned); + /* + * Normally on a zoned device we're only doing COW writes, but + * in case of relocation on a zoned filesystem we have taken + * precaution, that we're only writing sequentially. It's safe + * to use run_delalloc_nocow() here, like for regular + * preallocated inodes. 
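btrfs_is_data_reloc_root(), which these hunks substitute for the open-coded objectid comparison, is presumably just a thin predicate over the root key. A sketch of the assumed shape, with the struct reduced to the single field the check needs and the constant's value taken on trust:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_DATA_RELOC_TREE_OBJECTID (-9ULL) /* assumed value; see ctree.h for the real one */

struct demo_root {
    uint64_t objectid;  /* stands in for root->root_key.objectid */
};

/* Assumed shape of the helper: true only for the data relocation tree. */
static bool demo_is_data_reloc_root(const struct demo_root *root)
{
    return root->objectid == DEMO_DATA_RELOC_TREE_OBJECTID;
}

int main(void)
{
    struct demo_root reloc = { .objectid = DEMO_DATA_RELOC_TREE_OBJECTID };
    struct demo_root fs    = { .objectid = 5 };

    printf("reloc root: %d, fs root: %d\n",
           demo_is_data_reloc_root(&reloc), demo_is_data_reloc_root(&fs));
    return 0;
}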
+ */ + ASSERT(!zoned || + (zoned && btrfs_is_data_reloc_root(inode->root))); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || @@ -2206,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2234,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } /* - * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit - * in a chunk's stripe. This function ensures that bios do not span a - * stripe/chunk - * - * @page - The page we are about to add to the bio - * @size - size we want to add to the bio - * @bio - bio we want to ensure is smaller than a stripe - * @bio_flags - flags of the bio - * - * return 1 if page cannot be added to the bio - * return 0 if page can be added to the bio - * return error otherwise - */ -int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags) -{ - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 logical = bio->bi_iter.bi_sector << 9; - u32 bio_len = bio->bi_iter.bi_size; - struct extent_map *em; - int ret = 0; - struct btrfs_io_geometry geom; - - if (bio_flags & EXTENT_BIO_COMPRESSED) - return 0; - - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); - if (ret < 0) - goto out; - - if (geom.len < bio_len + size) - ret = 1; -out: - free_extent_map(em); - return ret; -} - -/* * in order to insert checksums into the metadata in large chunks, * we wait until bio submission time. All the pages in the bio are * checksummed and sums are attached onto the ordered extent record. @@ -2531,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, goto mapit; } else if (async && !skip_sum) { /* csum items have already been cloned */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, @@ -2764,7 +2788,7 @@ out_page: clear_page_dirty_for_io(page); SetPageError(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -2819,7 +2843,7 @@ int btrfs_writepage_cow_fixup(struct page *page) * page->mapping outside of the page lock. 
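The Set/ClearPageChecked conversions in these hunks replace a single per-page flag with per-sector state kept by the subpage code. The real btrfs_subpage layout is not shown in this diff, so the following is only a conceptual sketch of a per-sector bitmap inside one 64K page; sizes and demo_* names are assumptions:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE  65536U
#define DEMO_SECTORSIZE  4096U   /* 16 sectors per page -> 16 bits of state */

/* Set the "checked" bits for byte range [start, start + len) inside the page. */
static uint16_t demo_set_checked(uint16_t bitmap, uint32_t start, uint32_t len)
{
    uint32_t first = start / DEMO_SECTORSIZE;
    uint32_t last  = (start + len - 1) / DEMO_SECTORSIZE;

    for (uint32_t i = first; i <= last; i++)
        bitmap |= (uint16_t)(1U << i);
    return bitmap;
}

int main(void)
{
    uint16_t checked = 0;

    /* Mark only the 16K at offset 32K; the rest of the page keeps its state. */
    checked = demo_set_checked(checked, 32768, 16384);
    printf("checked bitmap: 0x%04x\n", (unsigned int)checked);  /* bits 8..11 set */
    return 0;
}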
*/ ihold(inode); - SetPageChecked(page); + btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -3010,8 +3034,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (ordered_extent->bdev) + /* A valid bdev implies a write on a sequential zone */ + if (ordered_extent->bdev) { btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } btrfs_free_io_failure_record(inode, start, end); @@ -3208,7 +3236,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, * * The length of such check is always one sector size. */ -static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, +static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff, u64 start) { @@ -3224,7 +3252,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, ASSERT(pgoff + len <= PAGE_SIZE); offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; + csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; @@ -3238,9 +3266,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); - if (io_bio->device) - btrfs_dev_stat_inc_and_print(io_bio->device, + bbio->mirror_num); + if (bbio->device) + btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -3260,33 +3288,29 @@ zeroit: * Return a bitmap where bit set means a csum mismatch, and bit not set means * csum match. */ -unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; - if (PageChecked(page)) { - ClearPageChecked(page); + if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { + btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); return 0; } /* - * For subpage case, above PageChecked is not safe as it's not subpage - * compatible. - * But for now only cow fixup and compressed read utilize PageChecked - * flag, while in this context we can easily use io_bio->csum to - * determine if we really need to do csum verification. - * - * So for now, just exit if io_bio->csum is NULL, as it means it's - * compressed read, and its compressed data csum has already been - * verified. + * This only happens for NODATASUM or compressed read. + * Normally this should be covered by above check for compressed read + * or the next check for NODATASUM. Just do a quicker exit here. 
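The bitmap convention described above, bit N set when sector N of the checked range failed verification, can be shown with a toy verifier. Only the bit arithmetic mirrors the code; the checksum comparison itself is faked:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_SECTORSIZE_BITS 12U    /* assumed 4K sectors */

/* Pretend only sector 2 of the range is corrupted. */
static bool demo_sector_ok(uint32_t sector_nr)
{
    return sector_nr != 2;
}

/* Build the result bitmap for a checked range of @range_len bytes. */
static unsigned int demo_verify_range(uint32_t range_len)
{
    unsigned int result = 0;

    for (uint32_t off = 0; off < range_len; off += 1U << DEMO_SECTORSIZE_BITS) {
        uint32_t nr_bit = off >> DEMO_SECTORSIZE_BITS;  /* sector index in the range */

        if (!demo_sector_ok(nr_bit))
            result |= 1U << nr_bit;  /* bit set means csum mismatch */
    }
    return result;
}

int main(void)
{
    /* A 16K range is 4 sectors; with sector 2 bad the bitmap is 0b0100. */
    printf("csum error bitmap: 0x%x\n", demo_verify_range(16384));
    return 0;
}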
*/ - if (io_bio->csum == NULL) + if (bbio->csum == NULL) return 0; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) @@ -3303,7 +3327,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, u64 file_offset = pg_off + page_offset(page); int ret; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + if (btrfs_is_data_reloc_root(root) && test_range_bit(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, 1, NULL)) { @@ -3313,7 +3337,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, + ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> @@ -4004,7 +4028,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); @@ -4034,11 +4058,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { + struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; @@ -4098,19 +4122,9 @@ skip_backref: goto err; } - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto err; - } - - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); - if (ret == -ENOENT) - ret = 0; - else if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); /* * If we have a pending delayed iput we could end up with the final iput @@ -4138,15 +4152,14 @@ out: } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode->root, inode); } return ret; } @@ -4175,7 +4188,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; @@ -4187,7 +4199,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (ret) @@ -4201,7 +4213,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) out: btrfs_end_transaction(trans); - 
btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return ret; } @@ -4368,7 +4380,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); @@ -4552,7 +4564,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int err = 0; - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; u64 last_unlink_trans; @@ -4577,7 +4588,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) { @@ -4598,7 +4609,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return err; } @@ -4907,9 +4918,9 @@ delete: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, extent_start, extent_num_bytes, 0); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, + root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -5105,7 +5116,8 @@ again: len); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, block_start, + block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -6435,7 +6447,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; - int nitems = name ? 2 : 1; + struct btrfs_item_batch batch; unsigned long ptr; unsigned int nofs_flag; int ret; @@ -6527,7 +6539,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); + batch.keys = &key[0]; + batch.data_sizes = &sizes[0]; + batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); + batch.nr = name ? 
2 : 1; + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) goto fail_unlock; @@ -7961,7 +7977,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->type = IOMAP_MAPPED; } iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) @@ -8038,13 +8054,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->logical_offset, + dip->file_offset, dip->bytes, !dip->dio_bio->bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, - dip->logical_offset, - dip->logical_offset + dip->bytes - 1); + dip->file_offset, + dip->file_offset + dip->bytes - 1); } bio_endio(dip->dio_bio); @@ -8072,10 +8088,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, return ret; } -static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, - struct btrfs_io_bio *io_bio, +static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, + struct btrfs_bio *bbio, const bool uptodate) { + struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -8083,11 +8100,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - u64 start = io_bio->logical; + const u64 orig_file_offset = dip->file_offset; + u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; - __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { unsigned int i, nr_sectors, pgoff; nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); @@ -8095,7 +8113,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && - (!csum || !check_data_csum(inode, io_bio, + (!csum || !check_data_csum(inode, bbio, bio_offset, bvec.bv_page, pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, @@ -8105,12 +8123,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, } else { int ret; - ASSERT((start - io_bio->logical) < UINT_MAX); + ASSERT((start - orig_file_offset) < UINT_MAX); ret = btrfs_repair_one_sector(inode, - &io_bio->bio, - start - io_bio->logical, + &bbio->bio, + start - orig_file_offset, bvec.bv_page, pgoff, - start, io_bio->mirror_num, + start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); @@ -8151,15 +8169,13 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (bio_op(bio) == REQ_OP_READ) { - err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), - !err); - } + if (bio_op(bio) == REQ_OP_READ) + err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8201,10 +8217,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, } else { u64 csum_offset; - 
csum_offset = file_offset - dip->logical_offset; + csum_offset = file_offset - dip->file_offset; csum_offset >>= fs_info->sectorsize_bits; csum_offset *= fs_info->csum_size; - btrfs_io_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@ -8239,7 +8255,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return NULL; dip->inode = inode; - dip->logical_offset = file_offset; + dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; @@ -8247,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, +static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { struct inode *inode = iter->inode; @@ -8277,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, } dio_bio->bi_status = BLK_STS_RESOURCE; bio_endio(dio_bio); - return BLK_QC_T_NONE; + return; } if (!write) { @@ -8320,7 +8336,6 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, @@ -8371,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, free_extent_map(em); } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err_em: free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - - return BLK_QC_T_NONE; } const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8696,9 +8709,9 @@ next: * did something wrong. 
*/ ASSERT(!PageOrdered(page)); + btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8842,7 +8855,7 @@ again: memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -9152,8 +9165,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); - WARN_ON(inode->delalloc_bytes); - WARN_ON(inode->new_delalloc_bytes); + if (!S_ISDIR(vfs_inode->i_mode)) { + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + } WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); @@ -9450,7 +9465,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9466,7 +9481,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ - ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9741,7 +9756,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, */ btrfs_pin_log_trans(root); log_pinned = true; - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9761,7 +9776,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { - ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9979,7 +9994,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte }; struct btrfs_fs_info *fs_info = root->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); @@ -9998,7 +10013,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, struct list_head splice; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; INIT_LIST_HEAD(&splice); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc61813213d8..02ff085c31bf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -48,6 +48,7 @@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" +#include "subpage.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 { compat_uptr_t clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ - 
__u64 reserved[4]; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ @@ -985,129 +987,32 @@ out: return ret; } -/* - * When we're defragging a range, we don't want to kick it off again - * if it is really just waiting for delalloc to send it down. - * If we find a nice big extent or delalloc range for the bytes in the - * file you want to defrag, we return 0 to let you know to skip this - * part of the file - */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) -{ - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 end; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); - read_unlock(&em_tree->lock); - - if (em) { - end = extent_map_end(em); - free_extent_map(em); - if (end - offset > thresh) - return 0; - } - /* if we already have a nice delalloc here, just stop */ - thresh /= 2; - end = count_range_bits(io_tree, &offset, offset + thresh, - thresh, EXTENT_DELALLOC, 1); - if (end >= thresh) - return 0; - return 1; -} - -/* - * helper function to walk through a file and find extents - * newer than a specific transid, and smaller than thresh. - * - * This is used by the defragging code to find new and small - * extents - */ -static int find_new_extents(struct btrfs_root *root, - struct inode *inode, u64 newer_than, - u64 *off, u32 thresh) -{ - struct btrfs_path *path; - struct btrfs_key min_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - int type; - int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - min_key.objectid = ino; - min_key.type = BTRFS_EXTENT_DATA_KEY; - min_key.offset = *off; - - while (1) { - ret = btrfs_search_forward(root, &min_key, path, newer_than); - if (ret != 0) - goto none; -process_slot: - if (min_key.objectid != ino) - goto none; - if (min_key.type != BTRFS_EXTENT_DATA_KEY) - goto none; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG && - btrfs_file_extent_num_bytes(leaf, extent) < thresh && - check_defrag_in_cache(inode, min_key.offset, thresh)) { - *off = min_key.offset; - btrfs_free_path(path); - return 0; - } - - path->slots[0]++; - if (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); - goto process_slot; - } - - if (min_key.offset == (u64)-1) - goto none; - - min_key.offset++; - btrfs_release_path(path); - } -none: - btrfs_free_path(path); - return -ENOENT; -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_SIZE; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; /* * hopefully we have this extent in the tree already, try without * the full extent lock */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); if (!em) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + 
u64 end = start + sectorsize - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - unlock_extent_cached(io_tree, start, end, &cached); + if (!locked) + lock_extent_bits(io_tree, start, end, &cached); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + if (!locked) + unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) return em; } -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + bool locked) { struct extent_map *next; bool ret = true; @@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len); + next = defrag_lookup_extent(inode, em->start + em->len, locked); if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && @@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return ret; } -static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, - u64 *last_len, u64 *skip, u64 *defrag_end, - int compress) +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, + pgoff_t index) { - struct extent_map *em; - int ret = 1; - bool next_mergeable = true; - bool prev_mergeable = true; + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); /* - * make sure that once we start defragging an extent, we keep on - * defragging it + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. 
*/ - if (start < *defrag_end) - return 1; + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } - *skip = 0; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } - em = defrag_lookup_extent(inode, start); - if (!em) - return 0; + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; - /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - ret = 0; - goto out; - } + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; - if (!*defrag_end) - prev_mergeable = false; + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } - next_mergeable = defrag_check_next_extent(inode, em); - /* - * we hit a real extent, if it is big or the next extent is not a - * real extent, don't bother defragging it - */ - if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || (!next_mergeable && !prev_mergeable))) - ret = 0; -out: /* - * last_len ends up being a counter of how many bytes we've defragged. - * every time we choose not to defrag an extent, we reset *last_len - * so that the next tiny extent will force a defrag. - * - * The end result of this is that tiny extents before a single big - * extent will force at least part of that big extent to be defragged. + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. */ - if (ret) { - *defrag_end = extent_map_end(em); - } else { - *last_len = 0; - *skip = extent_map_end(em); - *defrag_end = 0; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } } - - free_extent_map(em); - return ret; + return page; } +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + /* - * it doesn't do much good to defrag one or two pages - * at a time. This pulls in a nice chunk of pages - * to COW and defrag. - * - * It also makes sure the delalloc code has enough - * dirty data to avoid making new small extents as part - * of the defrag + * Collect all valid target extents. * - * It's a good idea to start RA on this range - * before calling this. + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. 
+ * @locked: if the range has already held extent lock + * @target_list: list of targets file extents */ -static int cluster_pages_for_defrag(struct inode *inode, - struct page **pages, - unsigned long start_index, - unsigned long num_pages) +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list) { - unsigned long file_end; - u64 isize = i_size_read(inode); - u64 page_start; - u64 page_end; - u64 page_cnt; - u64 start = (u64)start_index << PAGE_SHIFT; - u64 search_start; - int ret; - int i; - int i_done; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_io_tree *tree; - struct extent_changeset *data_reserved = NULL; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + u64 cur = start; + int ret = 0; - file_end = (isize - 1) >> PAGE_SHIFT; - if (!isize || start_index > file_end) - return 0; + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; - page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); + em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + if (!em) + break; - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start, page_cnt << PAGE_SHIFT); - if (ret) - return ret; - i_done = 0; - tree = &BTRFS_I(inode)->io_tree; + /* Skip hole/inline/preallocated extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; - /* step one, lock all the pages */ - for (i = 0; i < page_cnt; i++) { - struct page *page; -again: - page = find_or_create_page(inode->i_mapping, - start_index + i, mask); - if (!page) - break; + /* Skip older extent */ + if (em->generation < newer_than) + goto next; - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - break; + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (em->len >= extent_thresh) + goto next; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ } - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - while (1) { - lock_extent_bits(tree, page_start, page_end, - &cached_state); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), - page_start); - unlock_extent_cached(tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * we unlocked the page above, so we need check if - * it was released or not. - */ - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; +add: + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. 
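The merge step described above only ever has to look at the tail of the list because targets are collected in ascending file offset order. A standalone sketch of that tail-merge using a plain array instead of the kernel list_head; all demo_* names are invented:

#include <stdint.h>
#include <stdio.h>

struct demo_range {
    uint64_t start;
    uint64_t len;
};

/*
 * Append [start, start + len) to the array, extending the last entry when the
 * new range is directly adjacent to it, the same rule the target list uses.
 */
static int demo_add_target(struct demo_range *ranges, int count,
                           uint64_t start, uint64_t len)
{
    if (count > 0 && ranges[count - 1].start + ranges[count - 1].len == start) {
        ranges[count - 1].len += len;   /* mergeable: enlarge the last entry */
        return count;
    }
    ranges[count].start = start;        /* not mergeable: new entry */
    ranges[count].len = len;
    return count + 1;
}

int main(void)
{
    struct demo_range ranges[8];
    int count = 0;

    count = demo_add_target(ranges, count, 0, 16384);
    count = demo_add_target(ranges, count, 16384, 4096);   /* adjacent: merged */
    count = demo_add_target(ranges, count, 65536, 8192);   /* gap: new entry */

    for (int i = 0; i < count; i++)
        printf("target %d: start=%llu len=%llu\n", i,
               (unsigned long long)ranges[i].start,
               (unsigned long long)ranges[i].len);
    return 0;
}

Because targets arrive in ascending order, checking only the previous (last) entry is sufficient, which is why the code above only looks at target_list->prev.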
+ */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; } + /* Fall through to allocate a new entry */ } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - ret = -EIO; - break; - } + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); } + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. + * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - pages[i] = page; - i_done++; + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); } - if (!i_done || ret) - goto out; + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - goto out; + return ret; +} - /* - * so now we have a nice long stream of locked - * and up to date pages, lets wait on them - */ - for (i = 0; i < i_done; i++) - wait_on_page_writeback(pages[i]); +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = 
last_index - start_index + 1; + int ret = 0; + int i; + + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; - lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + /* Lock the pages range */ + lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* - * When defragmenting we skip ranges that have holes or inline extents, - * (check should_defrag_range()), to avoid unnecessary IO and wasting - * space. At btrfs_defrag_file(), we check if a range should be defragged - * before locking the inode and then, if it should, we trigger a sync - * page cache readahead - we lock the inode only after that to avoid - * blocking for too long other tasks that possibly want to operate on - * other file ranges. But before we were able to get the inode lock, - * some other task may have punched a hole in the range, or we may have - * now an inline extent, in which case we should not defrag. So check - * for that here, where we have the inode and the range locked, and bail - * out if that happened. + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. 
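The @locked flag mentioned above is a common conditional-locking pattern: the same collection helper runs once optimistically and once under the extent lock, and it must not take the lock a second time. A generic sketch with a pthread mutex standing in for the extent lock; everything here is illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Same idea as the @locked flag: only take and drop the lock when the caller
 * does not already hold it, so the locked re-check cannot deadlock.
 */
static int demo_collect(bool locked)
{
    if (!locked)
        pthread_mutex_lock(&demo_lock);

    /* ... inspect state under the lock here ... */

    if (!locked)
        pthread_mutex_unlock(&demo_lock);
    return 0;
}

int main(void)
{
    demo_collect(false);                /* optimistic pass: takes the lock itself */

    pthread_mutex_lock(&demo_lock);
    demo_collect(true);                 /* locked re-check: must not lock again */
    pthread_mutex_unlock(&demo_lock);

    puts("no deadlock");
    return 0;
}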
*/ - search_start = page_start; - while (search_start < page_end) { - struct extent_map *em; + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list); + if (ret < 0) + goto unlock_extent; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, - page_end - search_start); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock_range; - } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - /* Ok, 0 means we did not defrag anything */ - ret = 0; - goto out_unlock_range; + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } +unlock_extent: + unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); +free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); } - search_start = extent_map_end(em); - free_extent_map(em); } + kfree(pages); + return ret; +} - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); +static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors) +{ + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; - if (i_done != page_cnt) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, (page_cnt - i_done) << PAGE_SHIFT, true); - } + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list); + if (ret < 0) + goto out; + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; - set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state); + /* Reached the limit */ + if (max_sectors && max_sectors == *sectors_defragged) + break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); - for (i = 0; i < i_done; i++) { - clear_page_dirty_for_io(pages[i]); - ClearPageChecked(pages[i]); - set_page_dirty(pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. 
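The max_sectors handling in the loop above is a running budget: each target range is clamped to what is left and the walk stops once the budget is spent. A sketch of the arithmetic with assumed values, keeping the budget in sectors throughout for clarity:

#include <stdint.h>
#include <stdio.h>

#define DEMO_SECTORSIZE 4096ULL     /* assumed sector size */

static uint64_t demo_min(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
    uint64_t max_sectors = 24;              /* caller's limit: 24 sectors (96K) */
    uint64_t sectors_defragged = 0;
    uint64_t targets[] = { 65536, 65536 };  /* two 64K target ranges */

    for (unsigned int i = 0; i < 2; i++) {
        uint64_t range_len = targets[i];

        if (max_sectors && sectors_defragged >= max_sectors)
            break;                          /* budget exhausted */
        if (max_sectors)
            range_len = demo_min(range_len,
                (max_sectors - sectors_defragged) * DEMO_SECTORSIZE);

        /* ... the range would be defragged here ... */
        sectors_defragged += range_len / DEMO_SECTORSIZE;
        printf("target %u: %llu bytes, %llu sectors used so far\n",
               i, (unsigned long long)range_len,
               (unsigned long long)sectors_defragged);
    }
    return 0;
}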
+ */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress); + if (ret < 0) + break; + *sectors_defragged += range_len; } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); - return i_done; - -out_unlock_range: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); out: - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); } - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); return ret; - } -int btrfs_defrag_file(struct inode *inode, struct file *file, +/* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NUL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + * will be defragged. + */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file_ra_state *ra = NULL; - unsigned long last_index; + unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); - u64 last_len = 0; - u64 skip = 0; - u64 defrag_end = 0; - u64 newer_off = range->start; - unsigned long i; - unsigned long ra_index = 0; - int ret; - int defrag_count = 0; + u64 cur; + u64 last_byte; + bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; + bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; - unsigned long cluster = max_cluster; - u64 new_align = ~((u64)SZ_128K - 1); - struct page **pages = NULL; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; if (isize == 0) return 0; @@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (extent_thresh == 0) extent_thresh = SZ_256K; + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len) - 1; + } else { + /* Defrag until file end */ + last_byte = isize - 1; + } + /* - * If we were not given a file, allocate a readahead context. As + * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so * we don't error out. 
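The 256K cluster split in the loop below always tries to end a cluster on a page boundary; the shift arithmetic is easier to see in isolation. A sketch assuming 4K pages, with invented demo_* names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12U             /* assumed 4K pages */
#define DEMO_CLUSTER    (256U * 1024U)

/*
 * Same shift trick as the defrag loop: the cluster ends 256K past the page
 * containing @cur, minus one byte, so cluster_end + 1 is always page aligned.
 */
static uint64_t demo_cluster_end(uint64_t cur)
{
    return (((cur >> DEMO_PAGE_SHIFT) +
             (DEMO_CLUSTER >> DEMO_PAGE_SHIFT)) << DEMO_PAGE_SHIFT) - 1;
}

int main(void)
{
    /* cur in the middle of a page: the cluster still ends on a page boundary. */
    printf("cur=5000   -> cluster_end=%llu\n",
           (unsigned long long)demo_cluster_end(5000));     /* 266239 = 4K + 256K - 1 */
    printf("cur=262144 -> cluster_end=%llu\n",
           (unsigned long long)demo_cluster_end(262144));   /* 524287 = 512K - 1 */
    return 0;
}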
*/ - if (!file) { + if (!ra) { + ra_allocated = true; ra = kzalloc(sizeof(*ra), GFP_KERNEL); if (ra) file_ra_state_init(ra, inode->i_mapping); - } else { - ra = &file->f_ra; - } - - pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_ra; - } - - /* find the last page to defrag */ - if (range->start + range->len > range->start) { - last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_SHIFT; - } else { - last_index = (isize - 1) >> PAGE_SHIFT; - } - - if (newer_than) { - ret = find_new_extents(root, inode, newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - /* - * we always align our defrag to help keep - * the extents in the file evenly spaced - */ - i = (newer_off & new_align) >> PAGE_SHIFT; - } else - goto out_ra; - } else { - i = range->start >> PAGE_SHIFT; } - if (!max_to_defrag) - max_to_defrag = last_index - i + 1; - /* - * make writeback starts from i, so the defrag range can be - * written sequentially. - */ - if (i < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = i; - - while (i <= last_index && defrag_count < max_to_defrag && - (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { - /* - * make sure we stop running if someone unmounts - * the FS - */ - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - break; - - if (btrfs_defrag_cancelled(fs_info)) { - btrfs_debug(fs_info, "defrag_file cancelled"); - ret = -EAGAIN; - goto error; - } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, - extent_thresh, &last_len, &skip, - &defrag_end, do_compress)){ - unsigned long next; - /* - * the should_defrag function tells us how much to skip - * bump our counter by the suggested amount - */ - next = DIV_ROUND_UP(skip, PAGE_SIZE); - i = max(i + 1, next); - continue; - } + while (cur < last_byte) { + u64 cluster_end; - if (!newer_than) { - cluster = (PAGE_ALIGN(defrag_end) >> - PAGE_SHIFT) - i; - cluster = min(cluster, max_cluster); - } else { - cluster = max_cluster; - } + /* The cluster size 256K should always be page aligned */ + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - if (i + cluster > ra_index) { - ra_index = max(i, ra_index); - if (ra) - page_cache_sync_readahead(inode->i_mapping, ra, - file, ra_index, cluster); - ra_index += cluster; - } + /* We want the cluster end at page boundary when possible */ + cluster_end = (((cur >> PAGE_SHIFT) + + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = min(cluster_end, last_byte); btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; - } else { - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + btrfs_inode_unlock(inode, 0); + break; } - if (ret < 0) { + if (!(inode->i_sb->s_flags & SB_ACTIVE)) { btrfs_inode_unlock(inode, 0); - goto out_ra; + break; } - - defrag_count += ret; - balance_dirty_pages_ratelimited(inode->i_mapping); + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + cluster_end + 1 - cur, extent_thresh, + newer_than, do_compress, + &sectors_defragged, max_to_defrag); btrfs_inode_unlock(inode, 0); - - if (newer_than) { - if (newer_off == (u64)-1) - break; - - if (ret > 0) - i += ret; - - newer_off = max(newer_off + 1, - (u64)i << PAGE_SHIFT); - - ret = find_new_extents(root, inode,
newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - i = (newer_off & new_align) >> PAGE_SHIFT; - } else { - break; - } - } else { - if (ret > 0) { - i += ret; - last_len += ret << PAGE_SHIFT; - } else { - i++; - last_len = 0; - } - } + if (ret < 0) + break; + cur = cluster_end + 1; } - ret = defrag_count; -error: - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) + if (ra_allocated) + kfree(ra); + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately. + */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; } - - if (range->compress_type == BTRFS_COMPRESS_LZO) { - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - } - -out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } - if (!file) - kfree(ra); - kfree(pages); return ret; } @@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size; @@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; + new_size = bdev_nr_bytes(device->bdev); else { if (sizestr[0] == '-') { mod = -1; @@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_finish; } - if (new_size > device->bdev->bd_inode->i_size) { + if (new_size > bdev_nr_bytes(device->bdev)) { ret = -EFBIG; goto out_finish; } @@ -3136,12 +3097,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } - /* Subpage defrag will be supported in later commits */ - if (root->fs_info->sectorsize < PAGE_SIZE) { - ret = -ENOTTY; - goto out; - } - switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { @@ -3176,7 +3131,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), file, + ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -3220,6 +3175,7 @@ out: static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = 
btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; @@ -3231,35 +3187,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto err_drop; + goto out; } if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && - strcmp("cancel", vol_args->name) == 0) + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + args.devid = vol_args->devid; + } else if (!strcmp("cancel", vol_args->name)) { cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret) - goto out; - /* Exclusive operation is now claimed */ + goto err_drop; - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); - else - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + /* Exclusive operation is now claimed */ + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); btrfs_exclop_finish(fs_info); @@ -3271,17 +3231,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) btrfs_info(fs_info, "device deleted: %s", vol_args->name); } -out: - kfree(vol_args); err_drop: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; @@ -3293,32 +3255,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out_drop_write; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - cancel = (strcmp("cancel", vol_args->name) == 0); + if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } - kfree(vol_args); -out_drop_write: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } @@ -3379,22 +3347,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; int ret = 0; - char *s_uuid = NULL; di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) 
return PTR_ERR(di_args); + args.devid = di_args->devid; if (!btrfs_is_empty_uuid(di_args->uuid)) - s_uuid = di_args->uuid; + args.uuid = di_args->uuid; rcu_read_lock(); - dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL); - + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { ret = -ENODEV; goto out; @@ -4430,7 +4397,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_quota_rescan_args qsa = {0}; - int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4441,9 +4407,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, } if (copy_to_user(arg, &qsa, sizeof(qsa))) - ret = -EFAULT; + return -EFAULT; - return ret; + return 0; } static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index a2e1f1f5c6e3..bbc45534ae9a 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG -static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { - lockdep_assert_held(&eb->lock); +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_write(&eb->lock); } #else -static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index c25dfd1a8a54..65cb0766e62d 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -32,19 +32,19 @@ * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. - * One segment represents at most one page of uncompressed data. + * One segment represents at most one sector of uncompressed data. * * 2.1 Segment header * Fixed size. LZO_LEN (4) bytes long, LE32. * Records the total size of the segment (not including the header). - * Segment header never crosses page boundary, thus it's possible to - * have at most 3 padding zeros at the end of the page. + * Segment header never crosses sector boundary, thus it's possible to + * have at most 3 padding zeros at the end of the sector. * * 2.2 Data Payload - * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE) - * which is 4419 for a 4KiB page. + * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize) + * which is 4419 for a 4KiB sectorsize. * - * Example: + * Example with 4K sectorsize: * Page 1: * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | @@ -112,163 +112,174 @@ static inline size_t read_compress_length(const char *buf) return le32_to_cpu(dlen); } +/* + * Will do: + * + * - Write a segment header into the destination + * - Copy the compressed buffer into the destination + * - Make sure we have enough space in the last sector to fit a segment header + * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * + * Will allocate new pages when needed. 
+ */ +static int copy_compressed_data_to_page(char *compressed_data, + size_t compressed_size, + struct page **out_pages, + u32 *cur_out, + const u32 sectorsize) +{ + u32 sector_bytes_left; + u32 orig_out; + struct page *cur_page; + char *kaddr; + + /* + * We never allow a segment header crossing sector boundary, previous + * run should ensure we have enough space left inside the sector. + */ + ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + + kaddr = kmap(cur_page); + write_compress_length(kaddr + offset_in_page(*cur_out), + compressed_size); + *cur_out += LZO_LEN; + + orig_out = *cur_out; + + /* Copy compressed data */ + while (*cur_out - orig_out < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, + orig_out + compressed_size - *cur_out); + + kunmap(cur_page); + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + kaddr = kmap(cur_page); + + memcpy(kaddr + offset_in_page(*cur_out), + compressed_data + *cur_out - orig_out, copy_len); + + *cur_out += copy_len; + } + + /* + * Check if we can fit the next segment header into the remaining space + * of the sector. + */ + sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; + if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) + goto out; + + /* The remaining size is not enough, pad it with zeros */ + memset(kaddr + offset_in_page(*cur_out), 0, + sector_bytes_left); + *cur_out += sector_bytes_left; + +out: + kunmap(cur_page); + return 0; +} + int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + struct page *page_in = NULL; + char *sizes_ptr; int ret = 0; - char *data_in; - char *cpage_out, *sizes_ptr; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; - unsigned long bytes_left; - unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; - size_t in_len; - size_t out_len; - char *buf; - unsigned long tot_in = 0; - unsigned long tot_out = 0; - unsigned long pg_bytes_left; - unsigned long out_offset; - unsigned long bytes; + /* Points to the file offset of input data */ + u64 cur_in = start; + /* Points to the current output byte */ + u32 cur_out = 0; + u32 len = *total_out; *out_pages = 0; *total_out = 0; *total_in = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); - /* - * store the size of all chunks of compressed data in - * the first 4 bytes + * Skip the header for now, we will later come back and write the total + * compressed size */ - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = page_address(out_page); - out_offset = LZO_LEN; - tot_out = LZO_LEN; - pages[0] = out_page; - nr_pages = 1; - pg_bytes_left = PAGE_SIZE - LZO_LEN; - - /* compress at most one page of data each time */ - in_len = min(len, PAGE_SIZE); - 
while (tot_in < len) { - ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, - &out_len, workspace->mem); - if (ret != LZO_E_OK) { - pr_debug("BTRFS: lzo in loop returned %d\n", - ret); + cur_out += LZO_LEN; + while (cur_in < start + len) { + char *data_in; + const u32 sectorsize_mask = sectorsize - 1; + u32 sector_off = (cur_in - start) & sectorsize_mask; + u32 in_len; + size_t out_len; + + /* Get the input page first */ + if (!page_in) { + page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); + ASSERT(page_in); + } + + /* Compress at most one sector of data each time */ + in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); + ASSERT(in_len); + data_in = kmap(page_in); + ret = lzo1x_1_compress(data_in + + offset_in_page(cur_in), in_len, + workspace->cbuf, &out_len, + workspace->mem); + kunmap(page_in); + if (ret < 0) { + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; } - /* store the size of this chunk of compressed data */ - write_compress_length(cpage_out + out_offset, out_len); - tot_out += LZO_LEN; - out_offset += LZO_LEN; - pg_bytes_left -= LZO_LEN; - - tot_in += in_len; - tot_out += out_len; - - /* copy bytes from the working buffer into the pages */ - buf = workspace->cbuf; - while (out_len) { - bytes = min_t(unsigned long, pg_bytes_left, out_len); - - memcpy(cpage_out + out_offset, buf, bytes); - - out_len -= bytes; - pg_bytes_left -= bytes; - buf += bytes; - out_offset += bytes; - - /* - * we need another page for writing out. - * - * Note if there's less than 4 bytes left, we just - * skip to a new page. - */ - if ((out_len == 0 && pg_bytes_left < LZO_LEN) || - pg_bytes_left == 0) { - if (pg_bytes_left) { - memset(cpage_out + out_offset, 0, - pg_bytes_left); - tot_out += pg_bytes_left; - } - - /* we're done, don't allocate new page */ - if (out_len == 0 && tot_in >= len) - break; - - if (nr_pages == nr_dest_pages) { - out_page = NULL; - ret = -E2BIG; - goto out; - } - - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = page_address(out_page); - pages[nr_pages++] = out_page; - - pg_bytes_left = PAGE_SIZE; - out_offset = 0; - } - } + ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + pages, &cur_out, sectorsize); + if (ret < 0) + goto out; + + cur_in += in_len; - /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) { + /* + * Check if we're making it bigger after two sectors. And if + * it is so, give up. 
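The reworked loop above compresses at most one sector per iteration and never lets an input chunk cross a sector boundary (offsets are taken relative to start, which btrfs keeps sector-aligned). A stand-alone sketch of just that walk, with names local to the sketch:

#include <stdint.h>
#include <stdio.h>

static void walk_sectors(uint64_t start, uint32_t len, uint32_t sectorsize)
{
	uint64_t cur = start;

	while (cur < start + len) {
		uint32_t off = (cur - start) & (sectorsize - 1);
		uint32_t chunk = sectorsize - off;

		if (chunk > start + len - cur)
			chunk = start + len - cur;
		printf("compress %u bytes at offset %llu\n",
		       chunk, (unsigned long long)cur);
		cur += chunk;
	}
}

int main(void)
{
	walk_sectors(0, 10000, 4096);	/* 4096 + 4096 + 1808 */
	return 0;
}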
+ */ + if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { ret = -E2BIG; goto out; } - /* we're all done */ - if (tot_in >= len) - break; - - if (tot_out > max_out) - break; - - bytes_left = len - tot_in; - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); - in_len = min(bytes_left, PAGE_SIZE); - } - - if (tot_out >= tot_in) { - ret = -E2BIG; - goto out; + /* Check if we have reached page boundary */ + if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + put_page(page_in); + page_in = NULL; + } } - /* store the size of all chunks of compressed data */ - sizes_ptr = page_address(pages[0]); - write_compress_length(sizes_ptr, tot_out); + /* Store the size of all chunks of compressed data */ + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, cur_out); + kunmap_local(sizes_ptr); ret = 0; - *total_out = tot_out; - *total_in = tot_in; + *total_out = cur_out; + *total_in = cur_in - start; out: - *out_pages = nr_pages; - - if (in_page) - put_page(in_page); - + *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -283,6 +294,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { + char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -290,9 +302,11 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + kaddr = kmap(cur_page); memcpy(dest + *cur_in - orig_in, - page_address(cur_page) + offset_in_page(*cur_in), + kaddr + offset_in_page(*cur_in), copy_len); + kunmap(cur_page); *cur_in += copy_len; } @@ -303,6 +317,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + char *kaddr; int ret; /* Compressed data length, can be unaligned */ u32 len_in; @@ -311,7 +326,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - len_in = read_compress_length(page_address(cb->compressed_pages[0])); + kaddr = kmap(cb->compressed_pages[0]); + len_in = read_compress_length(kaddr); + kunmap(cb->compressed_pages[0]); cur_in += LZO_LEN; /* @@ -345,8 +362,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - seg_len = read_compress_length(page_address(cur_page) + - offset_in_page(cur_in)); + kaddr = kmap(cur_page); + seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + kunmap(cur_page); cur_in += LZO_LEN; /* Copy the compressed segment payload into workspace */ @@ -431,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = page_address(dest_page); + kaddr = kmap_local_page(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -441,6 +459,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); + kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8d268ca8aa7..0e239a4c3b26 100644 --- 
a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -60,8 +60,7 @@ enum btrfs_rbio_ops { }; struct btrfs_raid_bio { - struct btrfs_fs_info *fs_info; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; /* while we're doing rmw on a stripe * we put it into a hash table so we can @@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work); static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) { btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); + btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bbio->raid_map[0]; + u64 num = rbio->bioc->raid_map[0]; /* * we shift down quite a bit. We're using byte @@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; h = table->table + bucket; /* hold the lock for the bucket because we may be @@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; spin_lock_irqsave(&table->cache_lock, flags); __remove_rbio_from_cache(rbio); @@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; spin_lock_irqsave(&table->cache_lock, flags); spin_lock(&rbio->bio_list_lock); @@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bbio->raid_map[0] != - cur->bbio->raid_map[0]) + if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) return 0; /* we can't merge with different operations */ @@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; - h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) continue; spin_lock(&cur->bio_list_lock); @@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) int keep_cache = 0; bucket = rbio_bucket(rbio); - h = rbio->fs_info->stripe_hash_table->table + bucket; + h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; if (list_empty(&rbio->plug_list)) cache_rbio(rbio); @@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - btrfs_put_bbio(rbio->bbio); + btrfs_put_bioc(rbio->bioc); kfree(rbio); } @@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *extra; if (rbio->generic_bio_cnt) - btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio) /* OK, we have read all the stripes we need to. 
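For the raid_write_end_io() hunk here, the tolerance decision is compact enough to sketch on its own: parity scrub tolerates no failed stripes, while an ordinary rmw write tolerates up to bioc->max_errors (1 for RAID5, 2 for RAID6). The helper name below is invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

static bool write_failed(bool parity_scrub, int errors, int max_errors)
{
	int allowed = parity_scrub ? 0 : max_errors;

	return errors > allowed;
}

int main(void)
{
	printf("%d\n", write_failed(false, 1, 1));	/* 0: RAID5 survives one loss */
	printf("%d\n", write_failed(false, 2, 1));	/* 1: a second loss is fatal */
	printf("%d\n", write_failed(true, 1, 2));	/* 1: scrub tolerates none */
	return 0;
}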
*/ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bbio->max_errors; + 0 : rbio->bioc->max_errors; if (atomic_read(&rbio->error) > max_errors) err = BLK_STS_IOERR; @@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; int num_pages = rbio_nr_pages(stripe_len, real_stripes); int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; @@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); - rbio->bbio = bbio; - rbio->fs_info = fs_info; + rbio->bioc = bioc; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->real_stripes = real_stripes; @@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); #undef CONSUME_ALLOC - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) nr_data = real_stripes - 1; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) nr_data = real_stripes - 2; else BUG(); @@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, struct bio *last = bio_list->tail; int ret; struct bio *bio; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; u64 disk_start; - stripe = &rbio->bbio->stripes[stripe_nr]; + stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + (page_index << PAGE_SHIFT); /* if the device is missing, just fail this stripe */ @@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_io_bio(bio)->device = stripe->dev; + bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); + btrfs_bio(bio)->device = stripe->dev; bio->bi_iter.bi_size = 0; bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; @@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) int i = 0; start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bbio->raid_map[0]; + stripe_offset = start - rbio->bioc->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_io_bio(bio)->iter; + bio->bi_iter = btrfs_bio(bio)->iter; bio_for_each_segment(bvec, bio, iter) { rbio->bio_pages[page_index + i] = bvec.bv_page; @@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) */ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { - struct btrfs_bio *bbio = rbio->bbio; + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; @@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } } - if (likely(!bbio->num_tgtdevs)) + if (likely(!bioc->num_tgtdevs)) goto write_data; for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bbio->tgtdev_map[stripe]) + if (!bioc->tgtdev_map[stripe]) continue; for (pagenr = 
0; pagenr < rbio->stripe_npages; pagenr++) { @@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bbio->tgtdev_map[stripe], + rbio->bioc->tgtdev_map[stripe], pagenr, rbio->stripe_len); if (ret) goto cleanup; @@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, { u64 physical = bio->bi_iter.bi_sector; int i; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; physical <<= 9; - for (i = 0; i < rbio->bbio->num_stripes; i++) { - stripe = &rbio->bbio->stripes[i]; + for (i = 0; i < rbio->bioc->num_stripes; i++) { + stripe = &rbio->bioc->stripes[i]; if (in_range(physical, stripe->physical, rbio->stripe_len) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; @@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, int i; for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bbio->raid_map[i]; + u64 stripe_start = rbio->bioc->raid_map[i]; if (in_range(logical, stripe_start, rbio->stripe_len)) return i; @@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) goto cleanup; /* @@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_rmw_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) /* * our main entry point for writes from the rest of the FS. 
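find_logical_bio_stripe() above is a linear scan of raid_map[] with an in_range() test. A stand-alone sketch with local names:

#include <stdint.h>
#include <stdio.h>

static int logical_to_stripe(uint64_t logical, const uint64_t *raid_map,
			     int nr_data, uint64_t stripe_len)
{
	int i;

	for (i = 0; i < nr_data; i++) {
		if (logical >= raid_map[i] && logical < raid_map[i] + stripe_len)
			return i;
	}
	return -1;
}

int main(void)
{
	uint64_t raid_map[2] = { 1 << 20, (1 << 20) + 65536 };

	printf("%d\n", logical_to_stripe((1 << 20) + 70000, raid_map, 2, 65536)); /* 1 */
	return 0;
}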
*/ -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style @@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bbio->raid_map[faila] == + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == RAID5_P_STRIPE) { err = BLK_STS_IOERR; goto cleanup; @@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { @@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); else __raid_recover_end_io(rbio); @@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * were up to date, or we might have no bios to read because * the devices were gone. */ - if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { + if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { __raid_recover_end_io(rbio); return 0; } else { @@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_recover_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2116,22 +2114,22 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. 
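The two-failure RAID6 handling in __raid_recover_end_io() amounts to a small dispatch on which stripes were lost. A rough sketch; since parity sorts to the end of the map, checking the higher failed slot first is enough, and the returned strings only name the paths taken in that function.

#include <stdio.h>

enum slot { DATA, P_STRIPE, Q_STRIPE };

static const char *recovery_path(enum slot faila, enum slot failb)
{
	if (failb == Q_STRIPE)
		return (faila == P_STRIPE) ? "fail the rbio (BLK_STS_IOERR)"
					   : "rebuild the data from P, RAID5 style";
	if (failb == P_STRIPE)
		return "raid6_datap_recov(): one data stripe plus P lost";
	return "raid6_2data_recov(): two data stripes lost";
}

int main(void)
{
	printf("%s\n", recovery_path(DATA, P_STRIPE));
	return 0;
}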
*/ -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io) +int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len, int mirror_num, int generic_io) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int ret; if (generic_io) { - ASSERT(bbio->mirror_num == mirror_num); - btrfs_io_bio(bio)->mirror_num = mirror_num; + ASSERT(bioc->mirror_num == mirror_num); + btrfs_bio(bio)->mirror_num = mirror_num; } - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) { if (generic_io) - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return PTR_ERR(rbio); } @@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn(fs_info, - "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", +"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bbio->map_type); + (u64)bio->bi_iter.bi_size, bioc->map_type); if (generic_io) - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); kfree(rbio); return -EIO; } @@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; } else { - btrfs_get_bbio(bbio); + btrfs_get_bioc(bioc); } /* @@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * - * Caller must have already increased bio_counter for getting @bbio. + * Caller must have already increased bio_counter for getting @bioc. * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. */ -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors) +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->operation = BTRFS_RBIO_PARITY_SCRUB; /* - * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted * to the end position, so this search can start from the first parity * stripe. 
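Because BTRFS_MAP_WRITE sorts parity stripes to the end of the map (the comment above), the scrubp search in the next hunk only has to scan [nr_data, real_stripes). A stand-alone sketch, with device identity reduced to an integer for illustration:

#include <stdio.h>

static int find_scrubp(const int *stripe_dev, int nr_data, int real_stripes,
		       int scrub_dev)
{
	int i;

	for (i = nr_data; i < real_stripes; i++) {
		if (stripe_dev[i] == scrub_dev)
			return i;
	}
	return -1;
}

int main(void)
{
	int devs[4] = { 10, 11, 12, 13 };	/* 2 data stripes + P + Q */

	printf("%d\n", find_scrubp(devs, 2, 4, 13));	/* 3 */
	return 0;
}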
*/ for (i = rbio->nr_data; i < rbio->real_stripes; i++) { - if (bbio->stripes[i].dev == scrub_dev) { + if (bioc->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } @@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); /* - * We have already increased bio_counter when getting bbio, record it + * We have already increased bio_counter when getting bioc, record it * so we can free it at rbio_orig_end_io(). */ rbio->generic_bio_cnt = 1; @@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, int stripe_offset; int index; - ASSERT(logical >= rbio->bbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + + ASSERT(logical >= rbio->bioc->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); + stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset >> PAGE_SHIFT; rbio->bio_pages[index] = page; } @@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { - struct btrfs_bio *bbio = rbio->bbio; + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = rbio->finish_pbitmap; int nr_data = rbio->nr_data; @@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, else BUG(); - if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); } @@ -2435,7 +2433,7 @@ writeback: page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); ret = rbio_add_io_page(rbio, &bio_list, page, - bbio->tgtdev_map[rbio->scrubp], + bioc->tgtdev_map[rbio->scrubp], pagenr, rbio->stripe_len); if (ret) goto cleanup; @@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) */ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) goto cleanup; if (rbio->faila >= 0 || rbio->failb >= 0) { @@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) * the data, so the capability of the repair is declined. * (In the case of RAID5, we can not repair anything) */ - if (dfail > rbio->bbio->max_errors - 1) + if (dfail > rbio->bioc->max_errors - 1) goto cleanup; /* @@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. 
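The dfail check in validate_rbio_for_parity_scrub() above encodes that a parity stripe being scrubbed cannot itself be used for repair, so only max_errors - 1 failed data stripes are tolerable (none on RAID5, one on RAID6). A minimal sketch, helper name invented here:

#include <stdbool.h>
#include <stdio.h>

static bool scrub_can_repair(int failed_data_stripes, int max_errors)
{
	return failed_data_stripes <= max_errors - 1;
}

int main(void)
{
	printf("%d\n", scrub_can_repair(1, 1));	/* 0: RAID5 cannot repair */
	printf("%d\n", scrub_can_repair(1, 2));	/* 1: RAID6 still can */
	return 0;
}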
*/ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid56_parity_scrub_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. */ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, + u64 length) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bbio, length); + rbio = alloc_rbio(fs_info, bioc, length); if (IS_ERR(rbio)) return NULL; @@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, } /* - * When we get bbio, we have already increased bio_counter, record it + * When we get bioc, we have already increased bio_counter, record it * so we can free it at rbio_orig_end_io() */ rbio->generic_bio_cnt = 1; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2503485db859..72c00fc284b5 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io); -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len); +int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len, int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors); +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, + u64 length); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 06713a8fe26b..eb96fdc3be25 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -227,7 +227,7 @@ start_machine: } static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, - struct btrfs_bio *bbio) + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = dev->fs_info; int ret; @@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, kref_init(&zone->refcnt); zone->elems = 0; zone->device = dev; 
/* our device always sits at index 0 */ - for (i = 0; i < bbio->num_stripes; ++i) { + for (i = 0; i < bioc->num_stripes; ++i) { /* bounds have already been checked */ - zone->devs[i] = bbio->stripes[i].dev; + zone->devs[i] = bioc->stripes[i].dev; } - zone->ndevs = bbio->num_stripes; + zone->ndevs = bioc->num_stripes; spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, @@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, int ret; struct reada_extent *re = NULL; struct reada_extent *re_exist = NULL; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; u64 length; @@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, */ length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0); - if (ret || !bbio || length < fs_info->nodesize) + &length, &bioc, 0); + if (ret || !bioc || length < fs_info->nodesize) goto error; - if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { + if (bioc->num_stripes > BTRFS_MAX_MIRRORS) { btrfs_err(fs_info, "readahead: more than %d copies not supported", BTRFS_MAX_MIRRORS); goto error; } - real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + real_stripes = bioc->num_stripes - bioc->num_tgtdevs; for (nzones = 0; nzones < real_stripes; ++nzones) { struct reada_zone *zone; - dev = bbio->stripes[nzones].dev; + dev = bioc->stripes[nzones].dev; /* cannot read ahead on missing device. */ if (!dev->bdev) continue; - zone = reada_find_zone(dev, logical, bbio); + zone = reada_find_zone(dev, logical, bioc); if (!zone) continue; @@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!have_zone) goto error; - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return re; error: @@ -488,7 +488,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); kfree(re); return re_exist; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d2062d5f71dd..e2b9f8616501 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.root; + ref_root = generic_ref->tree_ref.owning_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.ref_root; + ref_root = generic_ref->data_ref.owning_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9b0814318e72..e0f93b357548 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, file_offset, block_size); btrfs_page_set_dirty(fs_info, page, file_offset, block_size); out_unlock: if (page) { @@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { - int ret; + int ret = 0; u64 i, tail_len, chunk_count; struct btrfs_root *root_dst = BTRFS_I(dst)->root; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 914d403b4415..33a0ee7ac590 
100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -25,6 +25,7 @@ #include "backref.h" #include "misc.h" #include "subpage.h" +#include "zoned.h" /* * Relocation overview @@ -1145,9 +1146,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, num_bytes, parent); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, + root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1156,9 +1157,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, parent); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, + root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1367,8 +1368,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, blocksize, path->nodes[level]->start); - ref.skip_qgroup = true; - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1376,8 +1377,8 @@ again: } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); - ref.skip_qgroup = true; - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, + true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1386,8 +1387,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); - ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1396,8 +1397,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); - ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, + 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2473,9 +2474,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, upper->eb->start); - ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&ref, node->level, - btrfs_header_owner(upper->eb)); + btrfs_header_owner(upper->eb), + root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, @@ -2691,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, key, path, 0, 1); btrfs_release_path(path); + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); if (ret > 0) ret = 0; } @@ -2852,31 +2857,6 @@ 
static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - /* - * On a zoned filesystem, we cannot preallocate the file region. - * Instead, we dirty and fiemap_write the region. - */ - if (btrfs_is_zoned(inode->root->fs_info)) { - struct btrfs_root *root = inode->root; - struct btrfs_trans_handle *trans; - - end = cluster->end - offset + 1; - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - i_size_write(&inode->vfs_inode, end); - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - return btrfs_end_transaction(trans); - } - btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2903,9 +2883,8 @@ static noinline_for_stack int prealloc_file_extent_cluster( return ret; } -static noinline_for_stack -int setup_extent_mapping(struct inode *inode, u64 start, u64 end, - u64 block_start) +static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, + u64 start, u64 end, u64 block_start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; @@ -3084,7 +3063,6 @@ release_page: static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; @@ -3105,7 +3083,7 @@ static int relocate_file_extent_cluster(struct inode *inode, file_ra_state_init(ra, inode->i_mapping); - ret = setup_extent_mapping(inode, cluster->start - offset, + ret = setup_relocation_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) goto out; @@ -3114,8 +3092,6 @@ static int relocate_file_extent_cluster(struct inode *inode, for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); - if (btrfs_is_zoned(fs_info) && !ret) - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3770,12 +3746,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; - if (btrfs_is_zoned(trans->fs_info)) - flags &= ~BTRFS_INODE_PREALLOC; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3790,7 +3762,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -4063,6 +4036,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group->start, rc->block_group->length); + ret = btrfs_zone_finish(rc->block_group); + WARN_ON(ret && ret != -EAGAIN); + while (1) { int finishes_stage; @@ -4386,8 +4362,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, if (!rc) return 0; - BUG_ON(rc->stage == UPDATE_DATA_PTRS && - root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + BUG_ON(rc->stage == 
UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 088641ba7a8e..cf82ea6f54fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -57,7 +57,7 @@ struct scrub_ctx; struct scrub_recover { refcount_t refs; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 map_length; }; @@ -254,7 +254,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static inline int scrub_is_page_on_raid56(struct scrub_page *spage) { return spage->recover && - (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -798,7 +798,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, { if (refcount_dec_and_test(&recover->refs)) { btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(recover->bbio); + btrfs_put_bioc(recover->bioc); kfree(recover); } } @@ -1027,8 +1027,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other = sblocks_for_recheck + mirror_index; } else { struct scrub_recover *r = sblock_bad->pagev[0]->recover; - int max_allowed = r->bbio->num_stripes - - r->bbio->num_tgtdevs; + int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; @@ -1218,14 +1217,14 @@ out: return 0; } -static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) +static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) { - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) return 2; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) return 3; else - return (int)bbio->num_stripes; + return (int)bioc->num_stripes; } static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, @@ -1269,7 +1268,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, u64 flags = original_sblock->pagev[0]->flags; u64 have_csum = original_sblock->pagev[0]->have_csum; struct scrub_recover *recover; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; @@ -1288,7 +1287,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, while (length > 0) { sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; - bbio = NULL; + bioc = NULL; /* * With a length of sectorsize, each returned stripe represents @@ -1296,27 +1295,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, */ btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio); - if (ret || !bbio || mapped_length < sublen) { - btrfs_put_bbio(bbio); + logical, &mapped_length, &bioc); + if (ret || !bioc || mapped_length < sublen) { + btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); return -ENOMEM; } refcount_set(&recover->refs, 1); - recover->bbio = bbio; + recover->bioc = bioc; recover->map_length = mapped_length; BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); - nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; 
mirror_index < nmirrors; mirror_index++) { @@ -1348,17 +1347,17 @@ leave_nomem: sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, - bbio->map_type, - bbio->raid_map, + bioc->map_type, + bioc->raid_map, mapped_length, - bbio->num_stripes - - bbio->num_tgtdevs, + bioc->num_stripes - + bioc->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); - spage->physical = bbio->stripes[stripe_index].physical + + spage->physical = bioc->stripes[stripe_index].physical + stripe_offset; - spage->dev = bbio->stripes[stripe_index].dev; + spage->dev = bioc->stripes[stripe_index].dev; BUG_ON(page_index >= original_sblock->page_count); spage->physical_for_dev_replace = @@ -1401,7 +1400,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_end_io = scrub_bio_wait_endio; mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio, + ret = raid56_parity_recover(bio, spage->recover->bioc, spage->recover->map_length, mirror_num, 0); if (ret) @@ -1423,7 +1422,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, if (!first_page->dev->bdev) goto out; - bio = btrfs_io_bio_alloc(BIO_MAX_VECS); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { @@ -1480,7 +1479,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!spage->page); - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio_set_dev(bio, spage->dev->bdev); bio_add_page(bio, spage->page, fs_info->sectorsize, 0); @@ -1562,7 +1561,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio_set_dev(bio, spage_bad->dev->bdev); bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; @@ -1676,7 +1675,7 @@ again: sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); + bio = btrfs_bio_alloc(sctx->pages_per_wr_bio); sbio->bio = bio; } @@ -2102,7 +2101,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); + bio = btrfs_bio_alloc(sctx->pages_per_rd_bio); sbio->bio = bio; } @@ -2203,7 +2202,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2211,27 +2210,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio); - if (ret || !bbio || !bbio->raid_map) - goto bbio_out; + &length, &bioc); + if (ret || !bioc || !bioc->raid_map) + goto bioc_out; if (WARN_ON(!sctx->is_dev_replace || - !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { + !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or * there's a bug in scrub_remap_extent()/btrfs_map_block(). 
*/ - goto bbio_out; + goto bioc_out; } - bio = btrfs_io_bio_alloc(0); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length); + rbio = raid56_alloc_missing_rbio(bio, bioc, length); if (!rbio) goto rbio_out; @@ -2249,9 +2248,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); -bbio_out: +bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); @@ -2826,7 +2825,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 length; int ret; @@ -2838,17 +2837,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio); - if (ret || !bbio || !bbio->raid_map) - goto bbio_out; + &length, &bioc); + if (ret || !bioc || !bioc->raid_map) + goto bioc_out; - bio = btrfs_io_bio_alloc(0); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio, - length, sparity->scrub_dev, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); if (!rbio) @@ -2860,9 +2859,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); -bbio_out: +bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -2901,7 +2900,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 flags; int ret; int slot; @@ -3044,22 +3043,22 @@ again: extent_len); mapped_length = extent_len; - bbio = NULL; + bioc = NULL; ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bbio, + extent_logical, &mapped_length, &bioc, 0); if (!ret) { - if (!bbio || mapped_length < extent_len) + if (!bioc || mapped_length < extent_len) ret = -EIO; } if (ret) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); goto out; } - extent_physical = bbio->stripes[0].physical; - extent_mirror_num = bbio->mirror_num; - extent_dev = bbio->stripes[0].dev; - btrfs_put_bbio(bbio); + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); ret = btrfs_lookup_csums_range(csum_root, extent_logical, @@ -3956,7 +3955,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_fs_info *fs_info = sctx->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; /* Seed devices of a new filesystem has their own generation. 
*/ @@ -4068,6 +4067,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; @@ -4115,7 +4115,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4288,11 +4288,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev) int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct btrfs_device *dev; struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (dev) sctx = dev->scrub_ctx; if (sctx) @@ -4309,20 +4310,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, int *extent_mirror_num) { u64 mapped_length; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; mapped_length = extent_len; ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, - &mapped_length, &bbio, 0); - if (ret || !bbio || mapped_length < extent_len || - !bbio->stripes[0].dev->bdev) { - btrfs_put_bbio(bbio); + &mapped_length, &bioc, 0); + if (ret || !bioc || mapped_length < extent_len || + !bioc->stripes[0].dev->bdev) { + btrfs_put_bioc(bioc); return; } - *extent_physical = bbio->stripes[0].physical; - *extent_mirror_num = bbio->mirror_num; - *extent_dev = bbio->stripes[0].dev; - btrfs_put_bbio(bbio); + *extent_physical = bioc->stripes[0].physical; + *extent_mirror_num = bioc->mirror_num; + *extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 72f9b865e847..040324d71118 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -84,6 +84,8 @@ struct send_ctx { u64 total_send_size; u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ + /* Protocol version compatibility requested */ + u32 proto; struct btrfs_root *send_root; struct btrfs_root *parent_root; @@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, sctx->parent_root->root_key.objectid : 0)); } +__maybe_unused +static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) +{ + switch (sctx->proto) { + case 1: return cmd < __BTRFS_SEND_C_MAX_V1; + case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + default: return false; + } +} + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * @@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) if (S_ISDIR(sctx->cur_inode_mode)) { ret = did_create_dir(sctx, sctx->cur_ino); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + else if (ret > 0) + return 0; } - ret = send_create_inode(sctx, sctx->cur_ino); - if (ret < 0) - goto out; - -out: - return ret; + return send_create_inode(sctx, sctx->cur_ino); } struct recorded_ref { @@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct 
btrfs_ioctl_send_args *arg) sctx->flags = arg->flags; + if (arg->flags & BTRFS_SEND_FLAG_VERSION) { + if (arg->version > BTRFS_SEND_STREAM_VERSION) { + ret = -EPROTO; + goto out; + } + /* Zero means "use the highest version" */ + sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; + } else { + sctx->proto = 1; + } + sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { ret = -EBADF; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index de91488b7cd0..23bcefc84e49 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -48,6 +48,7 @@ struct btrfs_tlv_header { enum btrfs_send_cmd { BTRFS_SEND_C_UNSPEC, + /* Version 1 */ BTRFS_SEND_C_SUBVOL, BTRFS_SEND_C_SNAPSHOT, @@ -76,6 +77,12 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + __BTRFS_SEND_C_MAX_V1, + + /* Version 2 */ + __BTRFS_SEND_C_MAX_V2, + + /* End */ __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index aa5be0b24987..48d77f360a24 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; + const bool aborted = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); @@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (ticket->steal && + if (!aborted && ticket->steal && steal_from_global_rsv(fs_info, space_info, ticket)) return true; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); remove_ticket(space_info, ticket); - ticket->error = -ENOSPC; + if (aborted) + ticket->error = -EIO; + else + ticket->error = -ENOSPC; wake_up(&ticket->wait); /* @@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, * here to see if we can make progress with the next ticket in * the list. */ - btrfs_try_granting_tickets(fs_info, space_info); + if (!aborted) + btrfs_try_granting_tickets(fs_info, space_info); } return (tickets_id != space_info->tickets_id); } @@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); } @@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) } else { flush_state = 0; } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; + } spin_unlock(&space_info->lock); } + return; + +aborted_fs: + maybe_fail_all_tickets(fs_info, space_info); + space_info->flush = 0; + spin_unlock(&space_info->lock); } void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb10e56ee31e..29bd8c7a7706 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,11 +63,41 @@ * This means a slightly higher tree locking latency. 
*/ +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) +{ + unsigned int cur = 0; + unsigned int nr_bits; + + ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); + + nr_bits = PAGE_SIZE / sectorsize; + subpage_info->bitmap_nr_bits = nr_bits; + + subpage_info->uptodate_offset = cur; + cur += nr_bits; + + subpage_info->error_offset = cur; + cur += nr_bits; + + subpage_info->dirty_offset = cur; + cur += nr_bits; + + subpage_info->writeback_offset = cur; + cur += nr_bits; + + subpage_info->ordered_offset = cur; + cur += nr_bits; + + subpage_info->checked_offset = cur; + cur += nr_bits; + + subpage_info->total_nr_bits = cur; +} + int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { - struct btrfs_subpage *subpage = NULL; - int ret; + struct btrfs_subpage *subpage; /* * We have cases like a dummy extent buffer page, which is not mappped @@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, */ if (page->mapping) ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) return 0; - ret = btrfs_alloc_subpage(fs_info, &subpage, type); - if (ret < 0) - return ret; + subpage = btrfs_alloc_subpage(fs_info, type); + if (IS_ERR(subpage)) + return PTR_ERR(subpage); + attach_page_private(page, subpage); return 0; } @@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, btrfs_free_subpage(subpage); } -int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - struct btrfs_subpage **ret, - enum btrfs_subpage_type type) +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type) { - if (fs_info->sectorsize == PAGE_SIZE) - return 0; + struct btrfs_subpage *ret; + unsigned int real_size; + + ASSERT(fs_info->sectorsize < PAGE_SIZE); + + real_size = struct_size(ret, bitmaps, + BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + ret = kzalloc(real_size, GFP_NOFS); + if (!ret) + return ERR_PTR(-ENOMEM); - *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); - if (!*ret) - return -ENOMEM; - spin_lock_init(&(*ret)->lock); + spin_lock_init(&ret->lock); if (type == BTRFS_SUBPAGE_METADATA) { - atomic_set(&(*ret)->eb_refs, 0); + atomic_set(&ret->eb_refs, 0); } else { - atomic_set(&(*ret)->readers, 0); - atomic_set(&(*ret)->writers, 0); + atomic_set(&ret->readers, 0); + atomic_set(&ret->writers, 0); } - return 0; + return ret; } void btrfs_free_subpage(struct btrfs_subpage *subpage) @@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) u32 orig_len = *len; *start = max_t(u64, page_offset(page), orig_start); - *len = min_t(u64, page_offset(page) + PAGE_SIZE, - orig_start + orig_len) - *start; + /* + * For certain call sites like btrfs_drop_pages(), we may have pages + * beyond the target range. In that case, just set @len to 0, subpage + * helpers can handle @len == 0 without any problem. + */ + if (page_offset(page) >= orig_start + orig_len) + *len = 0; + else + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; } void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, @@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, btrfs_subpage_assert(fs_info, page, start, len); + /* + * We have call sites passing @lock_page into + * extent_clear_unlock_delalloc() for compression path. 
+ * + * This @locked_page is locked by plain lock_page(), thus its + * subpage::writers is 0. Handle them in a special way. + */ + if (atomic_read(&subpage->writers) == 0) + return true; + ASSERT(atomic_read(&subpage->writers) >= nbits); return atomic_sub_and_test(nbits, &subpage->writers); } @@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, unlock_page(page); } -/* - * Convert the [start, start + len) range into a u16 bitmap - * - * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. - */ -static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start, + unsigned int nbits) { - const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; - const int nbits = len >> fs_info->sectorsize_bits; + unsigned int found_zero; - btrfs_subpage_assert(fs_info, page, start, len); + found_zero = find_next_zero_bit(addr, start + nbits, start); + if (found_zero == start + nbits) + return true; + return false; +} - /* - * Here nbits can be 16, thus can go beyond u16 range. We make the - * first left shift to be calculate in unsigned long (at least u32), - * then truncate the result to u16. - */ - return (u16)(((1UL << nbits) - 1) << bit_start); +static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start, + unsigned int nbits) +{ + unsigned int found_set; + + found_set = find_next_bit(addr, start + nbits, start); + if (found_set == start + nbits) + return true; + return false; } +#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, page, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + +#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ + bitmap_test_range_all_set(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + +#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ + bitmap_test_range_all_zero(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->uptodate_bitmap |= tmp; - if (subpage->uptodate_bitmap == U16_MAX) + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) SetPageUptodate(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->uptodate_bitmap &= ~tmp; + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); 
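The fs/btrfs/subpage.c hunks above switch the fixed u16 per-state bitmaps to one packed bitmap whose per-state ranges are located through btrfs_subpage_info offsets, in the order uptodate, error, dirty, writeback, ordered, checked. For illustration only, a minimal userspace sketch of that layout; the struct and helper names here (subpage_layout, range_assign) are simplified assumptions, not the kernel API:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SZ     65536u          /* assumed 64K page */
#define SECTOR_SZ   4096u           /* assumed 4K sector */
#define NR_STATES   6u              /* uptodate, error, dirty, writeback, ordered, checked */

struct subpage_layout {
    unsigned int bitmap_nr_bits;    /* bits per state = PAGE_SZ / SECTOR_SZ */
    unsigned int offset[NR_STATES]; /* where each state's range starts */
    unsigned int total_nr_bits;
};

static void layout_init(struct subpage_layout *l)
{
    unsigned int cur = 0, i;

    l->bitmap_nr_bits = PAGE_SZ / SECTOR_SZ;
    for (i = 0; i < NR_STATES; i++) {
        l->offset[i] = cur;         /* consecutive ranges, one per state */
        cur += l->bitmap_nr_bits;
    }
    l->total_nr_bits = cur;
}

/* Set or clear bits [start, start + len) in a byte-backed bitmap. */
static void range_assign(unsigned char *bm, unsigned int start,
                         unsigned int len, bool set)
{
    for (unsigned int i = start; i < start + len; i++) {
        if (set)
            bm[i / 8] |= 1u << (i % 8);
        else
            bm[i / 8] &= ~(1u << (i % 8));
    }
}

int main(void)
{
    struct subpage_layout l;
    unsigned char bitmap[64] = { 0 };

    layout_init(&l);
    /* Mark sectors 2..5 of the first state as done, mirroring
     * bitmap_set(subpage->bitmaps, start_bit, len >> sectorsize_bits). */
    range_assign(bitmap, l.offset[0] + 2, 4, true);
    printf("bits per state %u, total %u\n", l.bitmap_nr_bits, l.total_nr_bits);
    return 0;
}

The point of the packing is that one allocation sized from PAGE_SIZE / sectorsize covers every state, so adding a state (such as the new checked bits) only appends another range and grows total_nr_bits.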
ClearPageUptodate(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + error, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->error_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); SetPageError(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + error, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->error_bitmap &= ~tmp; - if (subpage->error_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, error)) ClearPageError(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->dirty_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); set_page_dirty(page); } @@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); unsigned long flags; bool last = false; spin_lock_irqsave(&subpage->lock, flags); - subpage->dirty_bitmap &= ~tmp; - if (subpage->dirty_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->writeback_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); set_page_writeback(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = 
btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->writeback_bitmap &= ~tmp; - if (subpage->writeback_bitmap == 0) { + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { ASSERT(PageWriteback(page)); end_page_writeback(page); } @@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->ordered_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); SetPageOrdered(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->ordered_bitmap &= ~tmp; - if (subpage->ordered_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) ClearPageOrdered(page); spin_unlock_irqrestore(&subpage->lock, flags); } + +void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + SetPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + ClearPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. 
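These helpers only flip a page-wide flag when an entire per-state range agrees, which is what bitmap_test_range_all_set()/_all_zero() in the hunk above check via find_next_bit()/find_next_zero_bit() returning start + nbits. A minimal standalone sketch of that range test; the helper names are assumptions for illustration, not the kernel bitmap API:

#include <stdbool.h>
#include <stdio.h>

/* True if every bit in [start, start + nbits) is set; models the
 * find_next_zero_bit() == start + nbits check in the patch. */
static bool range_all_set(const unsigned char *bm, unsigned int start,
                          unsigned int nbits)
{
    for (unsigned int i = start; i < start + nbits; i++)
        if (!(bm[i / 8] & (1u << (i % 8))))
            return false;           /* found a zero bit -> not fully set */
    return true;
}

/* True if every bit in [start, start + nbits) is clear. */
static bool range_all_zero(const unsigned char *bm, unsigned int start,
                           unsigned int nbits)
{
    for (unsigned int i = start; i < start + nbits; i++)
        if (bm[i / 8] & (1u << (i % 8)))
            return false;           /* found a set bit -> not fully clear */
    return true;
}

int main(void)
{
    unsigned char bm[4] = { 0xff, 0x0f, 0, 0 };

    /* The page-wide flag is only changed when the whole range agrees,
     * e.g. SetPageUptodate() once all uptodate bits are set. */
    printf("bits 0..11 all set:   %d\n", range_all_set(bm, 0, 12));
    printf("bits 16..31 all zero: %d\n", range_all_zero(bm, 16, 16));
    return 0;
}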
@@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + name, start, len); \ unsigned long flags; \ bool ret; \ \ spin_lock_irqsave(&subpage->lock, flags); \ - ret = ((subpage->name##_bitmap & tmp) == tmp); \ + ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + len >> fs_info->sectorsize_bits); \ spin_unlock_irqrestore(&subpage->lock, flags); \ return ret; \ } @@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, PageOrdered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit @@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->private); - ASSERT(subpage->dirty_bitmap == 0); + ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); +} + +/* + * Handle different locked pages with different page sizes: + * + * - Page locked by plain lock_page() + * It should not have any subpage::writers count. + * Can be unlocked by unlock_page(). + * This is the most common locked page for __extent_writepage() called + * inside extent_write_cache_pages() or extent_write_full_page(). + * Rarer cases include the @locked_page from extent_write_locked_range(). + * + * - Page locked by lock_delalloc_pages() + * There is only one caller, all pages except @locked_page for + * extent_write_locked_range(). + * In this case, we have to call subpage helper to handle the case. + */ +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len) +{ + struct btrfs_subpage *subpage; + + ASSERT(PageLocked(page)); + /* For regular page size case, we just unlock the page */ + if (fs_info->sectorsize == PAGE_SIZE) + return unlock_page(page); + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * For subpage case, there are two types of locked page. With or + * without writers number. + * + * Since we own the page lock, no one else could touch subpage::writers + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->writers)) + /* No writers, locked by plain lock_page() */ + return unlock_page(page); + + /* Have writers, use proper subpage helper to end it */ + btrfs_page_end_writer_lock(fs_info, page, start, len); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 0120948f37a1..7accb5c40d33 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,38 @@ #include <linux/spinlock.h> /* - * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap - * is sufficient. Regular bitmap_* is not used due to size reasons. + * Extra info for subpapge bitmap. 
+ * + * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into + * one larger bitmap. + * + * This structure records how they are organized in the bitmap: + * + * /- uptodate_offset /- error_offset /- dirty_offset + * | | | + * v v v + * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o| + * |<- bitmap_nr_bits ->| + * |<--------------- total_nr_bits ---------------->| */ -#define BTRFS_SUBPAGE_BITMAP_SIZE 16 +struct btrfs_subpage_info { + /* Number of bits for each bitmap */ + unsigned int bitmap_nr_bits; + + /* Total number of bits for the whole bitmap */ + unsigned int total_nr_bits; + + /* + * *_start indicates where the bitmap starts, the length is always + * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. + */ + unsigned int uptodate_offset; + unsigned int error_offset; + unsigned int dirty_offset; + unsigned int writeback_offset; + unsigned int ordered_offset; + unsigned int checked_offset; +}; /* * Structure to trace status of each sector inside a page, attached to @@ -18,10 +46,6 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - u16 uptodate_bitmap; - u16 error_bitmap; - u16 dirty_bitmap; - u16 writeback_bitmap; /* * Both data and metadata needs to track how many readers are for the * page. @@ -38,14 +62,11 @@ struct btrfs_subpage { * manages whether the subpage can be detached. */ atomic_t eb_refs; - /* Structures only used by data */ - struct { - atomic_t writers; - /* Tracke pending ordered extent in this sector */ - u16 ordered_bitmap; - }; + /* Structures only used by data */ + atomic_t writers; }; + unsigned long bitmaps[]; }; enum btrfs_subpage_type { @@ -53,15 +74,15 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct page *page); /* Allocate additional data where page represents more than one sector */ -int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - struct btrfs_subpage **ret, - enum btrfs_subpage_type type); +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, @@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); +DECLARE_BTRFS_SUBPAGE_OPS(checked); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page); +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 537d90bf5d84..a1c54a2c787c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_close_devices; } - bdev = fs_devices->latest_bdev; + bdev = fs_devices->latest_dev->bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { @@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { - if 
(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { btrfs_err(fs_info, "Remounting read-write after error is not allowed"); ret = -EINVAL; @@ -2463,30 +2463,16 @@ static int btrfs_unfreeze(struct super_block *sb) static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_device *dev, *first_dev = NULL; /* - * Lightweight locking of the devices. We should not need - * device_list_mutex here as we only read the device data and the list - * is protected by RCU. Even if a device is deleted during the list - * traversals, we'll get valid data, the freeing callback will wait at - * least until the rcu_read_unlock. + * There should be always a valid pointer in latest_dev, it may be stale + * for a short moment in case it's being deleted but still valid until + * the end of RCU grace period. */ rcu_read_lock(); - list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) - continue; - if (!dev->name) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - - if (first_dev) - seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\"); - else - WARN_ON(1); + seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); rcu_read_unlock(); + return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 25a6f587852b..f9eff3b0f77c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj, } else val = can_modify_feature(fa); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = { static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return scnprintf(buf, PAGE_SIZE, "0\n"); + return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj, * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - (i == 0 ? "" : " "), btrfs_super_csum_name(i)); + ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "), + btrfs_super_csum_name(i)); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); static ssize_t send_stream_version_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); + return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION); } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); @@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj, int i; for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - (i ? " " : ""), rescue_opts[i]); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "%s%s", (i ? 
" " : ""), rescue_opts[i]); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_rescue_options, @@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, /* 4K sector size is also supported with 64K page size */ if (PAGE_SIZE == SZ_64K) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K); + ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); /* Only sectorsize == PAGE_SIZE is now supported */ - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); + ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; } @@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - fs_info->discard_ctl.discard_bitmap_bytes); + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - fs_info->discard_ctl.discard_extent_bytes); + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - READ_ONCE(fs_info->discard_ctl.iops_limit)); + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.iops_limit)); } static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, @@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - READ_ONCE(fs_info->discard_ctl.kbps_limit)); + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.kbps_limit)); } static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, @@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - READ_ONCE(fs_info->discard_ctl.max_discard_size)); + return sysfs_emit(buf, "%llu\n", + 
READ_ONCE(fs_info->discard_ctl.max_discard_size)); } static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, @@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) val = *value_ptr; if (lock) spin_unlock(lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, val += block_group->used; } up_read(&sinfo->groups_sem); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } /* @@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj, ssize_t ret; spin_lock(&fs_info->super_lock); - ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; @@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj, int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); - return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override); + return sysfs_emit(buf, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, @@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%pU\n", - fs_info->fs_devices->metadata_uuid); + return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); } BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); @@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); - return scnprintf(buf, PAGE_SIZE, "%s (%s)\n", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(fs_info->csum_shash)); + return sysfs_emit(buf, "%s (%s)\n", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(fs_info->csum_shash)); } BTRFS_ATTR(, checksum, btrfs_checksum_show); @@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, str = "UNKNOWN\n"; break; } - return scnprintf(buf, PAGE_SIZE, "%s", str); + return sysfs_emit(buf, "%s", str); } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); @@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", 
fs_info->generation); + return sysfs_emit(buf, "%llu\n", fs_info->generation); } BTRFS_ATTR(, generation, btrfs_generation_show); @@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); ssize_t ret; - ret = scnprintf(buf, PAGE_SIZE, "%d\n", - READ_ONCE(fs_info->bg_reclaim_threshold)); + ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); return ret; } @@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); @@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); @@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - READ_ONCE(device->scrub_speed_max)); + return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); } static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, @@ -1538,7 +1533,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); @@ -1549,14 +1544,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, devid_kobj); if (!device->dev_stats_valid) - return scnprintf(buf, PAGE_SIZE, "invalid\n"); + return sysfs_emit(buf, "invalid\n"); /* * Print all at once so we get a snapshot of all values from the same * time. Keep them in sync and in order of definition of * btrfs_dev_stat_values. 
*/ - return scnprintf(buf, PAGE_SIZE, + return sysfs_emit(buf, "write_errs %d\n" "read_errs %d\n" "flush_errs %d\n" diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index df54cdfdc250..2a95f7224e18 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, path, &key, value_len); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 73e96d505f4f..c2e72e7a8ff0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize) } set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize) goto out_bits; } start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; } - if (end != (u64)-1) { + if (end != test_start + PAGE_SIZE - 1) { test_err("did not return the proper end offset"); goto out_bits; } @@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize) /* We unlocked it in the previous test */ lock_page(locked_page); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; /* * Currently if we fail to find dirty pages in the delalloc range we * will adjust max_bytes down to PAGE_SIZE and then re-search. 
If diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index c9874b12d337..cac89c388131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, &path, &key, value_len); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 14b9fdc8aaa9..1c3a1189c0bd 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -331,7 +331,7 @@ loop: */ kfree(cur_trans); goto loop; - } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); kfree(cur_trans); return -EROFS; @@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (TRANS_ABORTED(trans) || - test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { + if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) err = trans->aborted; @@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ - if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; goto cleanup_transaction; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7efc26aa82a..8ab33caf016f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,7 +94,7 @@ enum { }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, @@ -207,7 +207,7 @@ again: } atomic_inc(&root->log_writers); - if (ctx && !ctx->logging_new_name) { + if (!ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. 
- * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static noinline int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int do_overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, item_size = btrfs_item_size_nr(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* look for the key in the destination tree */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; + /* Our caller must have done a search for the key for us. */ + ASSERT(path->nodes[0] != NULL); + + /* + * And the slot must point to the exact key or the slot where the key + * should be at (the first item with a key greater than 'key') + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + ret = btrfs_comp_cpu_keys(&found_key, key); + ASSERT(ret >= 0); + } else { + ret = 1; + } if (ret == 0) { char *src_copy; @@ -585,6 +583,36 @@ no_copy: } /* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + + /* Look for the key in the destination tree. 
*/ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + + return do_overwrite_item(trans, root, path, eb, slot, key); +} + +/* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ @@ -761,7 +789,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, 0); btrfs_init_data_ref(&ref, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; @@ -893,11 +921,11 @@ out: * item */ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_dir_item *di) { + struct btrfs_root *root = dir->root; struct inode *inode; char *name; int name_len; @@ -926,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, + ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, name_len); if (ret) goto out; @@ -939,9 +967,11 @@ out: } /* - * helper function to see if a given name and sequence number found - * in an inode back reference are already in a directory and correctly - * point to this inode + * See if a given name and sequence number found in an inode back reference are + * already in a directory and correctly point to this inode. + * + * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it + * exists. */ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, @@ -950,29 +980,34 @@ static noinline int inode_in_dir(struct btrfs_root *root, { struct btrfs_dir_item *di; struct btrfs_key location; - int match = 0; + int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; - } else + } else { goto out; - btrfs_release_path(path); + } + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; - match = 1; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } out: btrfs_release_path(path); - return match; + return ret; } /* @@ -1084,7 +1119,7 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, dir, inode, + ret = btrfs_unlink_inode(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) @@ -1155,7 +1190,7 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(victim_parent), inode, victim_name, @@ -1182,8 +1217,10 @@ next: /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1192,8 
+1229,10 @@ next: /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1313,7 +1352,7 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); @@ -1374,10 +1413,11 @@ out: return ret; } -static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, +static int add_link(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, const char *name, int namelen, u64 ref_index) { + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_dir_item *dir_item; struct btrfs_key key; struct btrfs_path *path; @@ -1411,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), name, namelen); if (ret) goto out; @@ -1517,10 +1557,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen)) { + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen); + if (ret < 0) { + goto out; + } else if (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1555,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -1571,7 +1613,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; /* insert our name */ - ret = add_link(trans, root, dir, inode, name, namelen, + ret = add_link(trans, dir, inode, name, namelen, ref_index); if (ret) goto out; @@ -1580,6 +1622,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; } + /* Else, ret == 1, we already have a perfect match, we're done. 
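
The reworked inode_in_dir() in this series drops its old boolean "match" result in favour of the tri-state convention that the comment above relies on: a negative errno on failure, 0 when the directory entry is absent, and 1 when it already exists. A rough standalone sketch of that convention (the names below are illustrative, not btrfs code):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical lookup: < 0 on error, 0 if absent, 1 if present. */
    static int entry_in_dir(const char *const *entries, int n, const char *name)
    {
        if (!name)
            return -EINVAL;
        for (int i = 0; i < n; i++)
            if (strcmp(entries[i], name) == 0)
                return 1;
        return 0;
    }

    int main(void)
    {
        const char *dir[] = { "foo", "bar" };
        int ret = entry_in_dir(dir, 2, "bar");

        if (ret < 0)
            fprintf(stderr, "lookup failed: %d\n", ret);
        else if (ret == 0)
            printf("absent: would insert a new link\n");
        else
            printf("present: perfect match, nothing to do\n");
        return 0;
    }
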
*/ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); @@ -1936,8 +1979,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct inode *dir; u8 log_type; - int exists; - int ret = 0; + bool exists; + int ret; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); bool name_added = false; @@ -1957,12 +2000,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len); btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; if (key->type == BTRFS_DIR_ITEM_KEY) { dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, @@ -1977,7 +2020,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } - if (IS_ERR_OR_NULL(dst_di)) { + + if (IS_ERR(dst_di)) { + ret = PTR_ERR(dst_di); + goto out; + } else if (!dst_di) { /* we need a sequence number to insert, so we only * do inserts for the BTRFS_DIR_INDEX_KEY types */ @@ -2003,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!exists) goto out; - ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); + ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); if (ret) goto out; @@ -2233,13 +2280,13 @@ out: * to is unlinked */ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, struct inode *dir, struct btrfs_key *dir_key) { + struct btrfs_root *root = BTRFS_I(dir)->root; int ret; struct extent_buffer *eb; int slot; @@ -2281,7 +2328,7 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || log_di == ERR_PTR(-ENOENT)) { + if (!log_di) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -2300,7 +2347,7 @@ again: } inc_nlink(inode); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, name_len); if (!ret) ret = btrfs_run_delayed_items(trans); @@ -2482,7 +2529,9 @@ again: else { ret = find_dir_range(log, path, dirid, key_type, &range_start, &range_end); - if (ret != 0) + if (ret < 0) + goto out; + else if (ret > 0) break; } @@ -2511,7 +2560,7 @@ again: if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, root, log, path, + ret = check_item_in_log(trans, log, path, log_path, dir, &found_key); if (ret) @@ -3019,9 +3068,6 @@ static void wait_for_writer(struct btrfs_root *root) static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - if (!ctx) - return; - mutex_lock(&root->log_mutex); list_del_init(&ctx->list); mutex_unlock(&root->log_mutex); @@ -3310,7 +3356,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * writing the super here would result in transid mismatches. If there * is an error here just bail. 
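
Several hunks above replace IS_ERR_OR_NULL() style checks with explicit three-way handling, because btrfs_lookup_dir_item() and friends can return an error pointer, NULL (entry not found) or a valid item, and only the NULL case should fall through to the insert or skip paths. A minimal userspace sketch of that ERR_PTR-style convention (the helper names mirror the kernel's, the lookup itself is made up):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long err) { return (void *)(intptr_t)err; }
    static inline long PTR_ERR(const void *p) { return (long)(intptr_t)p; }
    static inline int IS_ERR(const void *p)
    {
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    static int item = 42;

    /* Hypothetical lookup used only for illustration. */
    static void *lookup_item(int key)
    {
        if (key < 0)
            return ERR_PTR(-EINVAL);   /* hard error */
        if (key != 42)
            return NULL;               /* not found  */
        return &item;                  /* found      */
    }

    int main(void)
    {
        void *di = lookup_item(42);

        if (IS_ERR(di))
            return (int)-PTR_ERR(di);  /* propagate the error */
        else if (!di)
            printf("not found: fall back to the insert path\n");
        else
            printf("found: %d\n", *(int *)di);
        return 0;
    }
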
*/ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EIO; btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3434,6 +3480,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return false; + /* * The inode's logged_trans is always 0 when we load it (because it is * not persisted in the inode item or elsewhere). So if it is 0, the @@ -3472,10 +3521,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans, * This optimizations allows us to avoid relogging the entire inode * or the entire directory. */ -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index) +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index) { struct btrfs_root *log; struct btrfs_dir_item *di; @@ -3485,11 +3534,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; mutex_lock(&dir->log_mutex); @@ -3537,49 +3586,36 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err == -ENOSPC) { + if (err < 0) btrfs_set_log_full_commit(trans); - err = 0; - } else if (err < 0 && err != -ENOENT) { - /* ENOENT can be returned if the entry hasn't been fsynced yet */ - btrfs_abort_transaction(trans, err); - } - btrfs_end_log_trans(root); - - return err; } /* see comments for btrfs_del_dir_entries_in_log */ -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid) +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; u64 index; int ret; if (!inode_logged(trans, inode)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; log = root->log_root; mutex_lock(&inode->log_mutex); ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); - if (ret == -ENOSPC) { + if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); - - return ret; } /* @@ -3615,31 +3651,231 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, return 0; } +static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct extent_buffer *src, + struct btrfs_path *dst_path, + int start_slot, + int count) +{ + char *ins_data = NULL; + struct btrfs_item_batch batch; + struct extent_buffer *dst; + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_key key; + u32 item_size; + int ret; + int i; + + ASSERT(count > 0); + batch.nr = count; + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); + item_size = btrfs_item_size_nr(src, start_slot); + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; + } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; + + ins_data = 
kmalloc(count * sizeof(u32) + + count * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + + for (i = 0; i < count; i++) { + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); + ins_sizes[i] = btrfs_item_size_nr(src, slot); + batch.total_data_size += ins_sizes[i]; + } + } + + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); + if (ret) + goto out; + + dst = dst_path->nodes[0]; + /* + * Copy all the items in bulk, in a single copy operation. Item data is + * organized such that it's placed at the end of a leaf and from right + * to left. For example, the data for the second item ends at an offset + * that matches the offset where the data for the first item starts, the + * data for the third item ends at an offset that matches the offset + * where the data of the second items starts, and so on. + * Therefore our source and destination start offsets for copy match the + * offsets of the last items (highest slots). + */ + dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); + src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); + copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); + btrfs_release_path(dst_path); +out: + kfree(ins_data); + + return ret; +} + +static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + int key_type, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + struct extent_buffer *src = path->nodes[0]; + const int nritems = btrfs_header_nritems(src); + const u64 ino = btrfs_ino(inode); + const bool inode_logged_before = inode_logged(trans, inode); + u64 last_logged_key_offset; + bool last_found = false; + int batch_start = 0; + int batch_size = 0; + int i; + + if (key_type == BTRFS_DIR_ITEM_KEY) + last_logged_key_offset = inode->last_dir_item_offset; + else + last_logged_key_offset = inode->last_dir_index_offset; + + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + + if (key.objectid != ino || key.type != key_type) { + last_found = true; + break; + } + + ctx->last_dir_item_offset = key.offset; + /* + * We must make sure that when we log a directory entry, the + * corresponding inode, after log replay, has a matching link + * count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * Would result in a fsync log that when replayed, our file inode + * would have a link count of 1, but we get two directory entries + * pointing to the same inode. After removing one of the names, + * it would not be possible to remove the other name, which + * resulted always in stale file handle errors, and would not be + * possible to rmdir the parent directory, since its i_size could + * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, + * resulting in -ENOTEMPTY errors. 
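
flush_dir_items_batch() above depends on the leaf layout described in its comment: item data is packed from the end of the leaf toward the front, so a run of consecutive items can be copied with one copy operation that starts at the offset of the highest slot. A small standalone illustration of that end-anchored layout (not btrfs code, sizes are arbitrary):

    #include <stdio.h>
    #include <string.h>

    #define LEAF_SIZE 64

    struct item { unsigned int offset; unsigned int size; };

    int main(void)
    {
        char leaf[LEAF_SIZE];
        char dst[LEAF_SIZE];
        struct item items[3];
        const char *payload[] = { "AAAA", "BB", "CCC" };
        unsigned int end = LEAF_SIZE;

        /* data for item i+1 ends where data for item i starts */
        for (int i = 0; i < 3; i++) {
            unsigned int len = (unsigned int)strlen(payload[i]);

            end -= len;
            items[i].offset = end;
            items[i].size = len;
            memcpy(leaf + end, payload[i], len);
        }

        /* bulk copy of items 0..2: start at the last item's offset */
        unsigned int total = items[0].size + items[1].size + items[2].size;

        memcpy(dst + items[2].offset, leaf + items[2].offset, total);
        printf("copied %u bytes in one memcpy\n", total);
        return 0;
    }
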
+ */ + if (!ctx->log_new_dentries) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &di_key); + if ((btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + di_key.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + + if (!inode_logged_before) + goto add_to_batch; + + /* + * If we were logged before and have logged dir items, we can skip + * checking if any item with a key offset larger than the last one + * we logged is in the log tree, saving time and avoiding adding + * contention on the log tree. + */ + if (key.offset > last_logged_key_offset) + goto add_to_batch; + /* + * Check if the key was already logged before. If not we can add + * it to a batch for bulk insertion. + */ + ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + btrfs_release_path(dst_path); + goto add_to_batch; + } + + /* + * Item exists in the log. Overwrite the item in the log if it + * has different content or do nothing if it has exactly the same + * content. And then flush the current batch if any - do it after + * overwriting the current item, or we would deadlock otherwise, + * since we are holding a path for the existing item. + */ + ret = do_overwrite_item(trans, log, dst_path, src, i, &key); + if (ret < 0) + return ret; + + if (batch_size > 0) { + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + batch_size = 0; + } + continue; +add_to_batch: + if (batch_size == 0) + batch_start = i; + batch_size++; + } + + if (batch_size > 0) { + int ret; + + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + } + + return last_found ? 1 : 0; +} + /* * log all the items included in the current transaction for a given * directory. This also creates the range items in the log tree required * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; + struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - struct extent_buffer *src; int err = 0; int ret; - int i; - int nritems; u64 first_offset = min_offset; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); - log = root->log_root; - min_key.objectid = ino; min_key.type = key_type; min_key.offset = min_offset; @@ -3713,62 +3949,14 @@ search: * from our directory */ while (1) { - struct btrfs_key tmp; - src = path->nodes[0]; - nritems = btrfs_header_nritems(src); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - - btrfs_item_key_to_cpu(src, &min_key, i); - - if (min_key.objectid != ino || min_key.type != key_type) - goto done; - - if (need_resched()) { - btrfs_release_path(path); - cond_resched(); - goto search; - } - - ret = overwrite_item(trans, log, dst_path, src, i, - &min_key); - if (ret) { + ret = process_dir_items_leaf(trans, inode, path, dst_path, + key_type, ctx); + if (ret != 0) { + if (ret < 0) err = ret; - goto done; - } - - /* - * We must make sure that when we log a directory entry, - * the corresponding inode, after log replay, has a - * matching link count. 
For example: - * - * touch foo - * mkdir mydir - * sync - * ln foo mydir/bar - * xfs_io -c "fsync" mydir - * <crash> - * <mount fs and log replay> - * - * Would result in a fsync log that when replayed, our - * file inode would have a link count of 1, but we get - * two directory entries pointing to the same inode. - * After removing one of the names, it would not be - * possible to remove the other name, which resulted - * always in stale file handle errors, and would not - * be possible to rmdir the parent directory, since - * its i_size could never decrement to the value - * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. - */ - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(src, di, &tmp); - if (ctx && - (btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - tmp.type != BTRFS_ROOT_ITEM_KEY) - ctx->log_new_dentries = true; + goto done; } - path->slots[0] = nritems; + path->slots[0] = btrfs_header_nritems(path->nodes[0]); /* * look ahead to the next item and see if it is also @@ -3782,21 +3970,26 @@ search: err = ret; goto done; } - btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.objectid != ino || tmp.type != key_type) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); + if (min_key.objectid != ino || min_key.type != key_type) { last_offset = (u64)-1; goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], - &tmp); + &min_key); if (ret) err = ret; else - last_offset = tmp.offset; + last_offset = min_key.offset; goto done; } + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } } done: btrfs_release_path(path); @@ -3829,7 +4022,7 @@ done: * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) @@ -3839,11 +4032,33 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, int ret; int key_type = BTRFS_DIR_ITEM_KEY; + /* + * If this is the first time we are being logged in the current + * transaction, or we were logged before but the inode was evicted and + * reloaded later, in which case its logged_trans is 0, reset the values + * of the last logged key offsets. Note that we don't use the helper + * function inode_logged() here - that is because the function returns + * true after an inode eviction, assuming the worst case as it can not + * know for sure if the inode was logged before. So we can not skip key + * searches in the case the inode was evicted, because it may not have + * been logged in this transaction and may have been logged in a past + * transaction, so we need to reset the last dir item and index offsets + * to (u64)-1. 
+ */ + if (inode->logged_trans != trans->transid) { + inode->last_dir_item_offset = (u64)-1; + inode->last_dir_index_offset = (u64)-1; + } again: min_key = 0; max_key = 0; + if (key_type == BTRFS_DIR_ITEM_KEY) + ctx->last_dir_item_offset = inode->last_dir_item_offset; + else + ctx->last_dir_item_offset = inode->last_dir_index_offset; + while (1) { - ret = log_dir_items(trans, root, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -3853,8 +4068,11 @@ again: } if (key_type == BTRFS_DIR_ITEM_KEY) { + inode->last_dir_item_offset = ctx->last_dir_item_offset; key_type = BTRFS_DIR_INDEX_KEY; goto again; + } else { + inode->last_dir_index_offset = ctx->last_dir_item_offset; } return 0; } @@ -3865,17 +4083,21 @@ again: * This cannot be run for file data extents because it does not * free the extents they point to. */ -static int drop_objectid_items(struct btrfs_trans_handle *trans, +static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - u64 objectid, int max_key_type) + struct btrfs_inode *inode, + int max_key_type) { int ret; struct btrfs_key key; struct btrfs_key found_key; int start_slot; - key.objectid = objectid; + if (!inode_logged(trans, inode)) + return 0; + + key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -3892,7 +4114,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - if (found_key.objectid != objectid) + if (found_key.objectid != key.objectid) break; found_key.offset = 0; @@ -3917,6 +4139,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, return ret; } +static int truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + + do { + ret = btrfs_truncate_inode_items(trans, log_root, inode, + new_size, min_type, NULL); + } while (ret == -EAGAIN); + + return ret; +} + static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, @@ -4089,6 +4326,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; + struct btrfs_item_batch batch; char *ins_data; int i; struct list_head ordered_sums; @@ -4103,13 +4341,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + batch.nr = nr; for (i = 0; i < nr; i++) { ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + batch.total_data_size += ins_sizes[i]; btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } - ret = btrfs_insert_empty_items(trans, log, dst_path, - ins_keys, ins_sizes, nr); + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) { kfree(ins_data); return ret; @@ -4321,13 +4563,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } static int log_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_root *root, + struct btrfs_inode *inode, const struct extent_map *em, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; 
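
log_one_extent() just below is one of many functions in this series that stop taking a struct btrfs_root parameter and derive it from the inode instead, since the root is always inode->root and the log root hangs off it. A tiny sketch of that refactor pattern (hypothetical types, not the btrfs structures):

    #include <stdio.h>

    struct root { int id; };
    struct inode { struct root *root; long ino; };

    /* before (hypothetically): log_inode(trans, root, inode, ...)
     * after: the root is always inode->root, so derive it locally. */
    static void log_inode(struct inode *inode)
    {
        struct root *root = inode->root;    /* derived, not passed */

        printf("logging inode %ld in root %d\n", inode->ino, root->id);
    }

    int main(void)
    {
        struct root r = { .id = 5 };
        struct inode i = { .root = &r, .ino = 257 };

        log_inode(&i);
        return 0;
    }
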
struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct btrfs_map_token token; @@ -4340,14 +4582,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - drop_args.path = path; - drop_args.start = em->start; - drop_args.end = em->start + em->len; - drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); - ret = btrfs_drop_extents(trans, log, inode, &drop_args); - if (ret) - return ret; + /* + * If this is the first time we are logging the inode in the current + * transaction, we can avoid btrfs_drop_extents(), which is expensive + * because it does a deletion search, which always acquires write locks + * for extent buffers at levels 2, 1 and 0. This not only wastes time + * but also adds significant contention in a log tree, since log trees + * are small, with a root at level 2 or 3 at most, due to their short + * life span. + */ + if (inode_logged(trans, inode)) { + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); + if (ret) + return ret; + } if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode); @@ -4505,13 +4758,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. */ - do { - ret = btrfs_truncate_inode_items(trans, - root->log_root, - inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY, - NULL); - } while (ret == -EAGAIN); + ret = truncate_inode_items(trans, root->log_root, inode, + truncate_offset, + BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; @@ -4538,7 +4787,6 @@ out: } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx) @@ -4603,7 +4851,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, ctx); + ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -4692,11 +4940,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, * with a journal, ext3/4, xfs, f2fs, etc). */ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path) { + struct btrfs_root *root = inode->root; int ret; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -4770,10 +5018,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, * truncate operation that changes the inode's size. */ static int btrfs_log_holes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -5050,7 +5298,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { - ret = btrfs_log_inode(trans, root, + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, ctx); @@ -5110,8 +5358,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. 
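
The inode_logged() gate added to log_one_extent() above skips btrfs_drop_extents() entirely when the inode has not been logged in the current transaction, since there is nothing to drop and the deletion search would only add lock contention on the small log tree. A schematic sketch of guarding an expensive "drop overlapping items" step behind such a cheap predicate (simplified, not the real check):

    #include <stdbool.h>
    #include <stdio.h>

    struct inode_state {
        unsigned long logged_trans;    /* transaction that last logged us */
    };

    /* simplified stand-in for inode_logged() */
    static bool inode_logged(const struct inode_state *inode, unsigned long transid)
    {
        return inode->logged_trans == transid;
    }

    static int log_extent(struct inode_state *inode, unsigned long transid)
    {
        if (inode_logged(inode, transid)) {
            /* only now is a search for overlapping logged extents needed */
            printf("dropping overlapping extent items in the log\n");
        }
        printf("inserting the new extent item\n");
        inode->logged_trans = transid;
        return 0;
    }

    int main(void)
    {
        struct inode_state inode = { .logged_trans = 0 };

        log_extent(&inode, 42);    /* first time: no drop needed */
        log_extent(&inode, 42);    /* second time: drop first */
        return 0;
    }
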
*/ - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, ctx); + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5222,7 +5469,7 @@ again: &other_ino, &other_parent); if (ret < 0) { return ret; - } else if (ret > 0 && ctx && + } else if (ret > 0 && other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { if (ins_nr > 0) { ins_nr++; @@ -5322,7 +5569,7 @@ next_key: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx) { @@ -5330,7 +5577,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; int err = 0; int ret = 0; bool fast_search = false; @@ -5372,22 +5619,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * Only run delayed items if we are a directory. We want to make sure * all directory indexes hit the fs/subvolume tree so we can find them * and figure out which index ranges have to be logged. - * - * Otherwise commit the delayed inode only if the full sync flag is set, - * as we want to make sure an up to date version is in the subvolume - * tree so copy_inode_items_to_log() / copy_items() can find it and copy - * it to the log tree. For a non full sync, we always log the inode item - * based on the in-memory struct btrfs_inode which is always up to date. */ - if (S_ISDIR(inode->vfs_inode.i_mode)) - ret = btrfs_commit_inode_delayed_items(trans, inode); - else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - ret = btrfs_commit_inode_delayed_inode(inode); - - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; + if (S_ISDIR(inode->vfs_inode.i_mode)) { + err = btrfs_commit_inode_delayed_items(trans, inode); + if (err) + goto out; } if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { @@ -5426,9 +5662,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, max_key_type); + ret = drop_inode_items(trans, log, path, inode, max_key_type); } else { - if (inode_only == LOG_INODE_EXISTS) { + if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). 
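
The isize handling referenced above applies to LOG_INODE_EXISTS: when only recording that an inode exists, the inode item written to the log must reuse the i_size already in the log (or 0 if none), not the possibly larger in-memory size. A very reduced model of that decision (illustrative only, the real code reads the size back from the log tree):

    #include <stdio.h>

    enum log_mode { LOG_INODE_ALL, LOG_INODE_EXISTS };

    static long long pick_logged_isize(enum log_mode mode, long long current_isize,
                                       long long previously_logged_isize)
    {
        if (mode == LOG_INODE_EXISTS)
            return previously_logged_isize;    /* may be 0 if never logged */
        return current_isize;
    }

    int main(void)
    {
        printf("exists-only log uses i_size %lld\n",
               pick_logged_isize(LOG_INODE_EXISTS, 8192, 4096));
        printf("full log uses i_size %lld\n",
               pick_logged_isize(LOG_INODE_ALL, 8192, 4096));
        return 0;
    }
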
@@ -5450,19 +5686,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - while(1) { - ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0, NULL); - if (ret != -EAGAIN) - break; - } + if (inode_logged(trans, inode)) + ret = truncate_inode_items(trans, log, + inode, 0, 0); } } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags) || @@ -5470,8 +5703,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5494,14 +5727,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, inode, path); if (err) goto out_unlock; } @@ -5521,16 +5754,14 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, root, inode, path, - dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx); + ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); if (ret) { err = ret; goto out_unlock; @@ -5545,59 +5776,52 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path, - ctx); + ret = log_directory_changes(trans, inode, path, dst_path, ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; /* - * If we are logging that an ancestor inode exists as part of logging a - * new name from a link or rename operation, don't mark the inode as - * logged - otherwise if an explicit fsync is made against an ancestor, - * the fsync considers the inode in the log and doesn't sync the log, - * resulting in the ancestor missing after a power failure unless the - * log was synced as part of an fsync against any other unrelated inode. - * So keep it simple for this case and just don't flag the ancestors as - * logged. + * Don't update last_log_commit if we logged that an inode exists. + * We do this for three reasons: + * + * 1) We might have had buffered writes to this inode that were + * flushed and had their ordered extents completed in this + * transaction, but we did not previously log the inode with + * LOG_INODE_ALL. Later the inode was evicted and after that + * it was loaded again and this LOG_INODE_EXISTS log operation + * happened. 
We must make sure that if an explicit fsync against + * the inode is performed later, it logs the new extents, an + * updated inode item, etc, and syncs the log. The same logic + * applies to direct IO writes instead of buffered writes. + * + * 2) When we log the inode with LOG_INODE_EXISTS, its inode item + * is logged with an i_size of 0 or whatever value was logged + * before. If later the i_size of the inode is increased by a + * truncate operation, the log is synced through an fsync of + * some other inode and then finally an explicit fsync against + * this inode is made, we must make sure this fsync logs the + * inode with the new i_size, the hole between old i_size and + * the new i_size, and syncs the log. + * + * 3) If we are logging that an ancestor inode exists as part of + * logging a new name from a link or rename operation, don't update + * its last_log_commit - otherwise if an explicit fsync is made + * against an ancestor, the fsync considers the inode in the log + * and doesn't sync the log, resulting in the ancestor missing after + * a power failure unless the log was synced as part of an fsync + * against any other unrelated inode. */ - if (!ctx || - !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && - &inode->vfs_inode != ctx->inode)) { - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - /* - * Don't update last_log_commit if we logged that an inode exists. - * We do this for two reasons: - * - * 1) We might have had buffered writes to this inode that were - * flushed and had their ordered extents completed in this - * transaction, but we did not previously log the inode with - * LOG_INODE_ALL. Later the inode was evicted and after that - * it was loaded again and this LOG_INODE_EXISTS log operation - * happened. We must make sure that if an explicit fsync against - * the inode is performed later, it logs the new extents, an - * updated inode item, etc, and syncs the log. The same logic - * applies to direct IO writes instead of buffered writes. - * - * 2) When we log the inode with LOG_INODE_EXISTS, its inode item - * is logged with an i_size of 0 or whatever value was logged - * before. If later the i_size of the inode is increased by a - * truncate operation, the log is synced through an fsync of - * some other inode and then finally an explicit fsync against - * this inode is made, we must make sure this fsync logs the - * inode with the new i_size, the hole between old i_size and - * the new i_size, and syncs the log. - */ - if (inode_only != LOG_INODE_EXISTS) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); - } + if (inode_only != LOG_INODE_EXISTS) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); - +out: btrfs_free_path(path); btrfs_free_path(dst_path); return err; @@ -5697,6 +5921,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_dir_list *dir_elem; int ret = 0; + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. 
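
The three-case comment above is ultimately about when inode->last_log_commit may be advanced: a later fsync compares it against last_sub_trans to decide whether the log still needs syncing, so an exists-only log must not make that check pass. A toy model of that comparison (simplified, with field names borrowed from the comment):

    #include <stdbool.h>
    #include <stdio.h>

    struct inode_log_state {
        unsigned long last_sub_trans;     /* bumped on each modification */
        unsigned long last_log_commit;    /* last sub-transaction fully logged */
    };

    static bool needs_log_sync(const struct inode_log_state *s)
    {
        return s->last_sub_trans > s->last_log_commit;
    }

    int main(void)
    {
        struct inode_log_state s = { .last_sub_trans = 7, .last_log_commit = 5 };

        printf("fsync must sync log: %s\n", needs_log_sync(&s) ? "yes" : "no");

        /* after a full LOG_INODE_ALL log, the commit point catches up */
        s.last_log_commit = s.last_sub_trans;
        printf("fsync must sync log: %s\n", needs_log_sync(&s) ? "yes" : "no");
        return 0;
    }
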
+ */ + if (ctx->logging_new_name) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5773,7 +6005,7 @@ process_leaf: ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); btrfs_add_delayed_iput(di_inode); if (ret) @@ -5917,11 +6149,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, continue; } - if (ctx) - ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && ctx && ctx->log_new_dentries) + if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); btrfs_add_delayed_iput(dir_inode); @@ -5967,7 +6198,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation >= trans->transid && need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) @@ -6022,7 +6253,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation >= trans->transid && need_log_inode(trans, inode)) { - ret = btrfs_log_inode(trans, root, inode, + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); if (ret) break; @@ -6165,7 +6396,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); + ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6182,7 +6413,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) + if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) log_dentries = true; /* @@ -6308,8 +6539,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to pin buffers while recovering log root tree."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6322,8 +6552,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_handle_fs_error(fs_info, ret, - "Couldn't find tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } if (ret > 0) { @@ -6340,8 +6569,7 @@ again: log = btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6369,8 +6597,7 @@ again: if (!ret) goto next; - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read target root for tree log recovery."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6378,14 +6605,15 @@ again: ret = btrfs_record_root_in_trans(trans, wc.replay_dest); if (ret) /* The loop needs to continue due to the root refs */ - btrfs_handle_fs_error(fs_info, ret, - "failed to record the log root in transaction"); + btrfs_abort_transaction(trans, ret); else ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) + btrfs_abort_transaction(trans, ret); } if (!ret && wc.stage == 
LOG_WALK_REPLAY_ALL) { @@ -6402,6 +6630,8 @@ again: * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); + if (ret) + btrfs_abort_transaction(trans, ret); } wc.replay_dest->log_root = NULL; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 731bd9c029f5..f6811c3df38a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Tracks the last logged dir item/index key offset. */ + u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ @@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index); -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid); +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index); +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2ec3b8ac8fa3..61ac57bcbf1a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include <linux/namei.h> #include "misc.h" #include "ctree.h" #include "extent_map.h" @@ -250,7 +251,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map); /* @@ -508,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } if (flush) - filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { blkdev_put(*bdev, flags); @@ -812,9 +813,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, device = NULL; } else { + struct btrfs_dev_lookup_args args = { + .devid = devid, + .uuid = disk_super->dev_item.uuid, + }; + mutex_lock(&fs_devices->device_list_mutex); - device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL); + device = btrfs_find_device(fs_devices, &args); /* * If this disk has been pulled into an fs devices created by @@ -1091,7 +1096,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) __btrfs_free_extra_devids(seed_dev, &latest_dev); - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; mutex_unlock(&uuid_mutex); } @@ -1122,8 +1127,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - if 
(test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; + } btrfs_close_bdev(device); if (device->bdev) { @@ -1222,7 +1229,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return -EINVAL; fs_devices->opened = 1; - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; @@ -1286,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev pgoff_t index; /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ @@ -1843,8 +1850,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); + btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; @@ -1882,18 +1891,22 @@ out: /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. + * + * We don't care about errors here, this is just to be kind to userspace. */ -static void update_dev_time(struct block_device *bdev) +static void update_dev_time(const char *device_path) { - struct inode *inode = bdev->bd_inode; + struct path path; struct timespec64 now; + int ret; - /* Shouldn't happen but just in case. */ - if (!inode) + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); + if (ret) return; - now = current_time(inode); - generic_update_time(inode, &now, S_MTIME | S_CTIME); + now = current_time(d_inode(path.dentry)); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + path_put(&path); } static int btrfs_rm_dev_item(struct btrfs_device *device) @@ -1917,7 +1930,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1986,7 +2001,7 @@ static struct btrfs_device * btrfs_find_next_active_device( } /* - * Helper function to check if the given device is part of s_bdev / latest_bdev + * Helper function to check if the given device is part of s_bdev / latest_dev * and replace it with the provided or the next active device, in the context * where this function called, there should be always be another device (or * this_dev) which is active. 
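
The update_dev_time() rework above resolves the device path itself (kern_path() plus inode_update_time()) and deliberately ignores failures, since the timestamp bump only exists to help ctime/mtime based probes such as libblkid. A best-effort userspace analogue using utimensat() (the path below is made up, and utimensat() refreshes atime/mtime rather than mtime/ctime):

    #define _POSIX_C_SOURCE 200809L
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    static void update_dev_time(const char *device_path)
    {
        /* NULL times means "now"; the return value is ignored on purpose */
        (void)utimensat(AT_FDCWD, device_path, NULL, 0);
    }

    int main(void)
    {
        update_dev_time("/dev/loop0");    /* hypothetical device path */
        printf("best-effort timestamp refresh attempted\n");
        return 0;
    }
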
@@ -2005,8 +2020,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, (fs_info->sb->s_bdev == device->bdev)) fs_info->sb->s_bdev = next_device->bdev; - if (fs_info->fs_devices->latest_bdev == device->bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; + if (fs_info->fs_devices->latest_dev->bdev == device->bdev) + fs_info->fs_devices->latest_dev = next_device; } /* @@ -2069,11 +2084,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(bdev); + update_dev_time(device_path); } -int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid, struct block_device **bdev, fmode_t *mode) +int btrfs_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + struct block_device **bdev, fmode_t *mode) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2081,22 +2097,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; - mutex_lock(&uuid_mutex); - + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. + */ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - device = btrfs_find_device_by_devspec(fs_info, devid, device_path); - - if (IS_ERR(device)) { - if (PTR_ERR(device) == -ENOENT && - device_path && strcmp(device_path, "missing") == 0) + device = btrfs_find_device(fs_info->fs_devices, args); + if (!device) { + if (args->missing) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else - ret = PTR_ERR(device); + ret = -ENOENT; goto out; } @@ -2126,11 +2143,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); if (!ret) btrfs_reada_remove_dev(device); - mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2159,7 +2174,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, /* * In normal cases the cur_devices == fs_devices. But in case * of deleting a seed device, the cur_devices should point to - * its own fs_devices listed under the fs_devices->seed. + * its own fs_devices listed under the fs_devices->seed_list. */ cur_devices = device->fs_devices; mutex_lock(&fs_devices->device_list_mutex); @@ -2210,14 +2225,21 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, synchronize_rcu(); btrfs_free_device(device); - if (cur_devices->open_devices == 0) { + /* + * This can happen if cur_devices is the private seed devices list. We + * cannot call close_fs_devices() here because it expects the uuid_mutex + * to be held, but in fact we don't need that for the private + * seed_devices, we can simply decrement cur_devices->opened and then + * remove it from our list and free the fs_devices. 
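
btrfs_rm_device() in the hunks that follow now takes a struct btrfs_dev_lookup_args instead of a path/devid pair, so the lookup criteria (devid, uuid, fsid, or "missing") can be assembled from the superblock before any locks are taken. A compact sketch of that args-struct lookup pattern (simplified fields, not the kernel structure):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct dev_lookup_args {
        unsigned long long devid;
        const char *uuid;    /* optional */
        bool missing;        /* match the first missing device */
    };

    struct device {
        unsigned long long devid;
        char uuid[16];
        bool present;
    };

    static struct device *find_device(struct device *devs, int n,
                                      const struct dev_lookup_args *args)
    {
        for (int i = 0; i < n; i++) {
            if (args->missing && !devs[i].present)
                return &devs[i];
            if (devs[i].devid != args->devid)
                continue;
            if (args->uuid && memcmp(devs[i].uuid, args->uuid, 16) != 0)
                continue;
            return &devs[i];
        }
        return NULL;
    }

    int main(void)
    {
        struct device devs[2] = {
            { .devid = 1, .uuid = "AAAAAAAAAAAAAAA", .present = true },
            { .devid = 2, .present = false },
        };
        struct dev_lookup_args args = { .missing = true };
        struct device *d = find_device(devs, 2, &args);

        printf("found devid %llu\n", d ? d->devid : 0ULL);
        return 0;
    }
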
+ */ + if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - close_fs_devices(cur_devices); + ASSERT(cur_devices->opened == 1); + cur_devices->opened--; free_fs_devices(cur_devices); } out: - mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -2305,13 +2327,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - /* - * The update_dev_time() with in btrfs_scratch_superblocks() - * may lead to a call to btrfs_show_devname() which will try - * to hold device_list_mutex. And here this device - * is already out of device list, so we don't have to hold - * the device_list_mutex lock. - */ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, tgtdev->name->str); @@ -2320,69 +2335,98 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -static struct btrfs_device *btrfs_find_device_by_path( - struct btrfs_fs_info *fs_info, const char *device_path) +/** + * Populate args from device at path + * + * @fs_info: the filesystem + * @args: the args to populate + * @path: the path to the device + * + * This will read the super block of the device at @path and populate @args with + * the devid, fsid, and uuid. This is meant to be used for ioctls that need to + * lookup a device to operate on, but need to do it before we take any locks. + * This properly handles the special case of "missing" that a user may pass in, + * and does some basic sanity checks. The caller must make sure that @path is + * properly NUL terminated before calling in, and must call + * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and + * uuid buffers. + * + * Return: 0 for success, -errno for failure + */ +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path) { - int ret = 0; struct btrfs_super_block *disk_super; - u64 devid; - u8 *dev_uuid; struct block_device *bdev; - struct btrfs_device *device; + int ret; - ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) - return ERR_PTR(ret); + if (!path || !path[0]) + return -EINVAL; + if (!strcmp(path, "missing")) { + args->missing = true; + return 0; + } - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; + args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); + args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); + if (!args->uuid || !args->fsid) { + btrfs_put_dev_args_from_path(args); + return -ENOMEM; + } + + ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, + &bdev, &disk_super); + if (ret) + return ret; + args->devid = btrfs_stack_device_id(&disk_super->dev_item); + memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid); + memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); else - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid); - + memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - if (!device) - device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return device; + return 0; } /* - * Lookup a device given by device id, or the path if the id is 0. 
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will + * allocate our ->uuid and ->fsid pointers, everybody else uses local variables + * that don't need to be freed. */ +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) +{ + kfree(args->uuid); + kfree(args->fsid); + args->uuid = NULL; + args->fsid = NULL; +} + struct btrfs_device *btrfs_find_device_by_devspec( struct btrfs_fs_info *fs_info, u64 devid, const char *device_path) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *device; + int ret; if (devid) { - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) return ERR_PTR(-ENOENT); return device; } - if (!device_path || !device_path[0]) - return ERR_PTR(-EINVAL); - - if (strcmp(device_path, "missing") == 0) { - /* Find first missing device */ - list_for_each_entry(device, &fs_info->fs_devices->devices, - dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state) && !device->bdev) - return device; - } + ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); + if (ret) + return ERR_PTR(ret); + device = btrfs_find_device(fs_info->fs_devices, &args); + btrfs_put_dev_args_from_path(&args); + if (!device) return ERR_PTR(-ENOENT); - } - - return btrfs_find_device_by_path(fs_info, device_path); + return device; } /* @@ -2459,6 +2503,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; @@ -2468,7 +2513,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) struct btrfs_key key; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - u64 devid; int ret; path = btrfs_alloc_path(); @@ -2480,7 +2524,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) key.type = BTRFS_DEV_ITEM_KEY; while (1) { + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; @@ -2505,13 +2551,14 @@ next_slot: dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); - devid = btrfs_device_id(leaf, dev_item); + args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + args.uuid = dev_uuid; + args.fsid = fs_uuid; + device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -2610,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = round_down(i_size_read(bdev->bd_inode), - fs_info->sectorsize); + device->total_bytes = + round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -2627,6 +2674,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_abort_transaction(trans, 
ret); goto error_trans; } + btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, + device); } device->fs_devices = fs_devices; @@ -2733,7 +2782,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_forget_devices(device_path); /* Update ctime/mtime for blkid or udev */ - update_dev_time(bdev); + update_dev_time(device_path); return ret; @@ -2826,6 +2875,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; + int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; @@ -2854,7 +2904,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); - return btrfs_update_device(trans, device); + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + + return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) @@ -3096,7 +3150,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4889,8 +4943,10 @@ again: round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); + btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4973,7 +5029,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) } /* - * Structure used internally for __btrfs_alloc_chunk() function. + * Structure used internally for btrfs_create_chunk() function. * Wraps needed parameters. 
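
Several hunks above bracket chunk tree modifications (device item insert and update, the sprout fixup, grow and shrink) between btrfs_reserve_chunk_metadata() and btrfs_trans_release_chunk_metadata(). The shape of that reserve/modify/release bracket, reduced to a standalone sketch (the helpers here are stand-ins, not the btrfs API):

    #include <stdio.h>

    static void reserve_chunk_metadata(int is_item_insertion)
    {
        printf("reserve metadata space (%s)\n",
               is_item_insertion ? "insertion" : "update");
    }

    static void release_chunk_metadata(void)
    {
        printf("release reservation\n");
    }

    static int update_device_item(void)
    {
        printf("update device item in the chunk tree\n");
        return 0;
    }

    int main(void)
    {
        int ret;

        reserve_chunk_metadata(0);
        ret = update_device_item();
        release_chunk_metadata();    /* released on success and on error */

        return ret;
    }
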
*/ struct alloc_chunk_ctl { @@ -5377,7 +5433,7 @@ error_del_extent: return block_group; } -struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; @@ -5578,12 +5634,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_alloc_chunk(trans, alloc_profile); + meta_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_alloc_chunk(trans, alloc_profile); + sys_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -5597,17 +5653,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) return btrfs_raid_array[index].tolerated_failures; } -int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - int readonly = 0; int miss_ndevs = 0; int i; + bool ret = true; em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) - return 1; + return false; map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { @@ -5618,21 +5674,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &map->stripes[i].dev->dev_state)) { - readonly = 1; + ret = false; goto end; } } /* - * If the number of missing devices is larger than max errors, - * we can not write the data into that chunk successfully, so - * set it readonly. + * If the number of missing devices is larger than max errors, we can + * not write the data into that chunk successfully. 
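A minimal user-space sketch of the writeability rule stated above, assuming a per-profile tolerated-failure count (for example 1 for RAID1/RAID5/RAID10 and 2 for RAID6); the type and function names here are illustrative, not the kernel's:

#include <stdbool.h>

/* Illustrative model: a chunk stays writeable only if every present stripe
 * sits on a writeable device and the number of missing devices does not
 * exceed what the chunk's profile tolerates. */
struct stripe_state {
        bool missing;
        bool writeable;
};

static bool chunk_writeable(const struct stripe_state *stripes, int num_stripes,
                            int tolerated_failures)
{
        int miss_ndevs = 0;

        for (int i = 0; i < num_stripes; i++) {
                if (stripes[i].missing) {
                        miss_ndevs++;
                        continue;
                }
                if (!stripes[i].writeable)
                        return false;
        }
        return miss_ndevs <= tolerated_failures;
}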
*/ if (miss_ndevs > btrfs_chunk_max_errors(map)) - readonly = 1; + ret = false; end: free_extent_map(em); - return readonly; + return ret; } void btrfs_mapping_tree_free(struct extent_map_tree *tree) @@ -5795,7 +5850,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) +static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) { int i; int again = 1; @@ -5804,52 +5859,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) again = 0; for (i = 0; i < num_stripes - 1; i++) { /* Swap if parity is on a smaller index */ - if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { - swap(bbio->stripes[i], bbio->stripes[i + 1]); - swap(bbio->raid_map[i], bbio->raid_map[i + 1]); + if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { + swap(bioc->stripes[i], bioc->stripes[i + 1]); + swap(bioc->raid_map[i], bioc->raid_map[i + 1]); again = 1; } } } } -static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + int total_stripes, + int real_stripes) { - struct btrfs_bio *bbio = kzalloc( - /* the size of the btrfs_bio */ - sizeof(struct btrfs_bio) + - /* plus the variable array for the stripes */ - sizeof(struct btrfs_bio_stripe) * (total_stripes) + - /* plus the variable array for the tgt dev */ + struct btrfs_io_context *bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ + sizeof(struct btrfs_io_stripe) * (total_stripes) + + /* Plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + /* - * plus the raid_map, which includes both the tgt dev - * and the stripes + * Plus the raid_map, which includes both the tgt dev + * and the stripes. */ sizeof(u64) * (total_stripes), GFP_NOFS|__GFP_NOFAIL); - atomic_set(&bbio->error, 0); - refcount_set(&bbio->refs, 1); + atomic_set(&bioc->error, 0); + refcount_set(&bioc->refs, 1); - bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); - bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + bioc->fs_info = fs_info; + bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); + bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); - return bbio; + return bioc; } -void btrfs_get_bbio(struct btrfs_bio *bbio) +void btrfs_get_bioc(struct btrfs_io_context *bioc) { - WARN_ON(!refcount_read(&bbio->refs)); - refcount_inc(&bbio->refs); + WARN_ON(!refcount_read(&bioc->refs)); + refcount_inc(&bioc->refs); } -void btrfs_put_bbio(struct btrfs_bio *bbio) +void btrfs_put_bioc(struct btrfs_io_context *bioc) { - if (!bbio) + if (!bioc) return; - if (refcount_dec_and_test(&bbio->refs)) - kfree(bbio); + if (refcount_dec_and_test(&bioc->refs)) + kfree(bioc); } /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? 
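The alloc_btrfs_io_context() helper above packs the context, its stripe array, the target-device map and the raid_map into a single allocation and then carves the trailing arrays out of it. A stand-alone sketch of that layout with illustrative (non-kernel) type names, glossing over the alignment of the carved-out arrays:

#include <stdint.h>
#include <stdlib.h>

struct io_stripe {
        void *dev;
        uint64_t physical;
        uint64_t length;
};

struct io_context {
        int *tgtdev_map;                /* points into the same allocation */
        uint64_t *raid_map;             /* likewise */
        struct io_stripe stripes[];     /* flexible array member, must be last */
};

static struct io_context *alloc_io_context(int total_stripes, int real_stripes)
{
        struct io_context *ctx;

        ctx = calloc(1, sizeof(*ctx) +
                        sizeof(struct io_stripe) * total_stripes +  /* stripes[] */
                        sizeof(int) * real_stripes +                /* tgtdev_map */
                        sizeof(uint64_t) * total_stripes);          /* raid_map */
        if (!ctx)
                return NULL;

        /* Carve the two trailing arrays out of the one allocation. */
        ctx->tgtdev_map = (int *)(ctx->stripes + total_stripes);
        ctx->raid_map = (uint64_t *)(ctx->tgtdev_map + real_stripes);
        return ctx;
}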
*/ @@ -5859,11 +5917,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { struct extent_map *em; struct map_lookup *map; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5882,8 +5940,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, int ret = 0; int i; - /* discard always return a bbio */ - ASSERT(bbio_ret); + /* Discard always returns a bioc. */ + ASSERT(bioc_ret); em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) @@ -5946,26 +6004,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &stripe_index); } - bbio = alloc_btrfs_bio(num_stripes, 0); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * + bioc->stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; + bioc->stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -5976,19 +6033,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; + bioc->stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; + bioc->stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bbio->stripes[i].length = length; + bioc->stripes[i].length = length; } stripe_index++; @@ -5998,9 +6053,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; out: free_extent_map(em); return ret; @@ -6024,7 +6079,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, u64 srcdev_devid, int *mirror_num, u64 *physical) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int num_stripes; int index_srcdev = 0; int found = 0; @@ -6033,20 +6088,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bbio, 0, 0); + logical, &length, &bioc, 0, 0); if (ret) { - ASSERT(bbio == NULL); + ASSERT(bioc == NULL); return ret; } - num_stripes = bbio->num_stripes; + num_stripes = bioc->num_stripes; if (*mirror_num > num_stripes) { /* * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, * that means that the requested area is not left of the left * cursor */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return -EIO; } @@ -6056,7 +6111,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, * pointer to the one of the target drive. 
*/ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid != srcdev_devid) + if (bioc->stripes[i].dev->devid != srcdev_devid) continue; /* @@ -6064,15 +6119,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, * mirror with the lowest physical address */ if (found && - physical_of_found <= bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); ASSERT(found); if (!found) @@ -6103,12 +6158,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_bio *bbio = *bbio_ret; + struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; int tgtdev_indexes = 0; int num_stripes = *num_stripes_ret; @@ -6138,17 +6193,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, */ index_where_to_add = num_stripes; for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; + struct btrfs_io_stripe *new = + bioc->stripes + index_where_to_add; + struct btrfs_io_stripe *old = + bioc->stripes + i; new->physical = old->physical; new->length = old->length; new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; + bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; max_errors++; tgtdev_indexes++; @@ -6168,30 +6223,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, * full copy of the source drive. 
*/ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* * In case of DUP, in order to keep it simple, * only add the mirror with the lowest physical * address */ if (found && - physical_of_found <= - bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } } if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_io_stripe *tgtdev_stripe = + bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; + bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + bioc->tgtdev_map[index_srcdev] = num_stripes; tgtdev_indexes++; num_stripes++; @@ -6200,8 +6254,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, *num_stripes_ret = num_stripes; *max_errors_ret = max_errors; - bbio->num_tgtdevs = tgtdev_indexes; - *bbio_ret = bbio; + bioc->num_tgtdevs = tgtdev_indexes; + *bioc_ret = bioc; } static bool need_full_stripe(enum btrfs_map_op op) @@ -6304,7 +6358,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map) { struct extent_map *em; @@ -6319,7 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; @@ -6328,7 +6382,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 raid56_full_stripe_start = (u64)-1; struct btrfs_io_geometry geom; - ASSERT(bbio_ret); + ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); em = btrfs_get_chunk_map(fs_info, logical, *length); @@ -6472,20 +6526,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, tgtdev_indexes = num_stripes; } - bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = map->stripes[stripe_index].physical + + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } - /* build raid_map */ + /* Build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; @@ -6497,15 +6551,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* Fill in the logical address of each stripe */ tmp = stripe_nr * data_stripes; for (i = 0; i < data_stripes; i++) - bbio->raid_map[(i+rot) % num_stripes] = + bioc->raid_map[(i + rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; - bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; if (map->type & 
BTRFS_BLOCK_GROUP_RAID6) - bbio->raid_map[(i+rot+1) % num_stripes] = + bioc->raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE; - sort_parity_stripes(bbio, num_stripes); + sort_parity_stripes(bioc, num_stripes); } if (need_full_stripe(op)) @@ -6513,15 +6567,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, &num_stripes, &max_errors); } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -6530,9 +6584,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, */ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { WARN_ON(num_stripes > 1); - bbio->stripes[0].dev = dev_replace->tgtdev; - bbio->stripes[0].physical = physical_to_patch_in_first_stripe; - bbio->mirror_num = map->num_stripes + 1; + bioc->stripes[0].dev = dev_replace->tgtdev; + bioc->stripes[0].physical = physical_to_patch_in_first_stripe; + bioc->mirror_num = map->num_stripes + 1; } out: if (dev_replace_is_ongoing) { @@ -6546,43 +6600,43 @@ out: int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num) + struct btrfs_io_context **bioc_ret, int mirror_num) { if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - length, bbio_ret); + length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) +static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) { - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; + bio->bi_private = bioc->private; + bio->bi_end_io = bioc->end_io; bio_endio(bio); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_io_context *bioc = bio->bi_private; int is_orig_bio = 0; if (bio->bi_status) { - atomic_inc(&bbio->error); + atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_io_bio(bio)->device; + struct btrfs_device *dev = btrfs_bio(bio)->device; ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) @@ -6597,22 +6651,22 @@ static void btrfs_end_bio(struct bio *bio) } } - if (bio == bbio->orig_bio) + if (bio == bioc->orig_bio) is_orig_bio = 1; - btrfs_bio_counter_dec(bbio->fs_info); + btrfs_bio_counter_dec(bioc->fs_info); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + if (atomic_dec_and_test(&bioc->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = bbio->orig_bio; + bio = 
bioc->orig_bio; } - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ - if (atomic_read(&bbio->error) > bbio->max_errors) { + if (atomic_read(&bioc->error) > bioc->max_errors) { bio->bi_status = BLK_STS_IOERR; } else { /* @@ -6622,19 +6676,19 @@ static void btrfs_end_bio(struct bio *bio) bio->bi_status = BLK_STS_OK; } - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } else if (!is_orig_bio) { bio_put(bio); } } -static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, +static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, u64 physical, struct btrfs_device *dev) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bioc->fs_info; - bio->bi_private = bbio; - btrfs_io_bio(bio)->device = dev; + bio->bi_private = bioc; + btrfs_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@ -6663,20 +6717,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfsic_submit_bio(bio); } -static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) { - atomic_inc(&bbio->error); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) { /* Should be the original bio. */ - WARN_ON(bio != bbio->orig_bio); + WARN_ON(bio != bioc->orig_bio); - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bbio->error) > bbio->max_errors) + if (atomic_read(&bioc->error) > bioc->max_errors) bio->bi_status = BLK_STS_IOERR; else bio->bi_status = BLK_STS_OK; - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } } @@ -6691,36 +6745,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int ret; int dev_nr; int total_devs; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; length = bio->bi_iter.bi_size; map_length = length; btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bbio, mirror_num, 1); + &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); return errno_to_blk_status(ret); } - total_devs = bbio->num_stripes; - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - bbio->fs_info = fs_info; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); + total_devs = bioc->num_stripes; + bioc->orig_bio = first_bio; + bioc->private = first_bio->bi_private; + bioc->end_io = first_bio->bi_end_io; + atomic_set(&bioc->stripes_pending, bioc->num_stripes); - if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(fs_info, bio, bbio, - map_length); + ret = raid56_parity_write(bio, bioc, map_length); } else { - ret = raid56_parity_recover(fs_info, bio, bbio, - map_length, mirror_num, 1); + ret = raid56_parity_recover(bio, bioc, map_length, + mirror_num, 1); } btrfs_bio_counter_dec(fs_info); @@ -6735,12 
+6787,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bbio->stripes[dev_nr].dev; + dev = bioc->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bbio_error(bbio, first_bio, logical); + bioc_error(bioc, first_bio, logical); continue; } @@ -6749,12 +6801,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, else bio = first_bio; - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; } +static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, + const struct btrfs_fs_devices *fs_devices) +{ + if (args->fsid == NULL) + return true; + if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) + return true; + return false; +} + +static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, + const struct btrfs_device *device) +{ + ASSERT((args->devid != (u64)-1) || args->missing); + + if ((args->devid != (u64)-1) && device->devid != args->devid) + return false; + if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) + return false; + if (!args->missing) + return true; + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; +} + /* * Find a device specified by @devid or @uuid in the list of @fs_devices, or * return NULL. @@ -6762,31 +6841,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. 
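With the matching helpers above, callers describe the device they want through a btrfs_dev_lookup_args instead of separate devid/uuid/fsid parameters. A sketch of the common devid-only lookup; the wrapper name is hypothetical, while the initializer macro and the btrfs_find_device() signature are the ones added by this patch:

/* Hypothetical caller: look a device up by devid alone. The initializer sets
 * devid to (u64)-1 so that devid 0 (BTRFS_DEV_REPLACE_DEVID) can still be
 * requested explicitly; uuid and fsid stay NULL and are ignored by the match. */
static struct btrfs_device *lookup_by_devid(struct btrfs_fs_devices *fs_devices,
                                            u64 devid)
{
        BTRFS_DEV_LOOKUP_ARGS(args);

        args.devid = devid;
        return btrfs_find_device(fs_devices, &args);
}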
*/ -struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid) +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; - if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + if (dev_args_match_fs_devices(args, fs_devices)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) + if (dev_args_match_device(args, device)) return device; } } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - if (!fsid || - !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &seed_devs->devices, - dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) - return device; - } + if (!dev_args_match_fs_devices(args, seed_devs)) + continue; + list_for_each_entry(device, &seed_devs->devices, dev_list) { + if (dev_args_match_device(args, device)) + return device; } } @@ -6952,6 +7025,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; @@ -7029,11 +7103,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); devid = btrfs_stripe_devid_nr(leaf, chunk, i); + args.devid = devid; read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL); + args.uuid = uuid; + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -7151,6 +7226,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; @@ -7159,11 +7235,13 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = btrfs_device_id(leaf, dev_item); + devid = args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); + args.uuid = dev_uuid; + args.fsid = fs_uuid; if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); @@ -7171,8 +7249,7 @@ static int read_one_dev(struct extent_buffer *leaf, return PTR_ERR(fs_devices); } - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7236,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf, fill_device_from_item(leaf, dev_item, device); if (device->bdev) { - u64 max_total_bytes = 
i_size_read(device->bdev->bd_inode); + u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, @@ -7841,12 +7918,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); + args.devid = stats->devid; + dev = btrfs_find_device(fs_info->fs_devices, &args); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7922,6 +8001,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@ -7977,7 +8057,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device boundary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2183361db614..3b8130680749 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -236,17 +236,40 @@ struct btrfs_fs_devices { bool fsid_change; struct list_head fs_list; + /* + * Number of devices under this fsid including missing and + * replace-target device and excludes seed devices. + */ u64 num_devices; + + /* + * The number of devices that successfully opened, including + * replace-target, excludes seed devices. + */ u64 open_devices; + + /* The number of devices that are under the chunk allocation list. */ u64 rw_devices; + + /* Count of missing devices under this fsid excluding seed device. */ u64 missing_devices; u64 total_rw_bytes; + + /* + * Count of devices from btrfs_super_block::num_devices for this fsid, + * which includes the seed device, excludes the transient replace-target + * device. + */ u64 total_devices; /* Highest generation number of seen devices */ u64 latest_generation; - struct block_device *latest_bdev; + /* + * The mount device or a device with highest generation after removal + * or replace. + */ + struct btrfs_device *latest_dev; /* all of the devices in the FS, protected by a mutex * so we can safely walk it to write out the supers without @@ -300,48 +323,62 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* - * we need the mirror number and stripe index to be passed around - * the call chain while we are processing end_io (especially errors). - * Really, what we need is a btrfs_bio structure that has this info - * and is properly sized with its stripe array, but we're not there - * quite yet. We have our own btrfs bioset, and all of the bios - * we allocate are actually btrfs_io_bios. We'll cram as much of - * struct btrfs_bio as we can into this over time. + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. */ -struct btrfs_io_bio { +struct btrfs_bio { unsigned int mirror_num; + + /* @device is for stripe IO submission. 
*/ struct btrfs_device *device; - u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_io_bio but relies on bio being last. + * bytes for entire btrfs_bio but relies on bio being last. */ struct bio bio; }; -static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) { - return container_of(bio, struct btrfs_io_bio, bio); + return container_of(bio, struct btrfs_bio, bio); } -static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { - if (io_bio->csum != io_bio->csum_inline) { - kfree(io_bio->csum); - io_bio->csum = NULL; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; } } -struct btrfs_bio_stripe { +struct btrfs_io_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio { +/* + * Context for IO subsmission for device stripe. + * + * - Track the unfinished mirrors for mirror based profiles + * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. + * + * - Contain the logical -> physical mapping info + * Used by submit_stripe_bio() for mapping logical bio + * into physical device address. + * + * - Contain device replace info + * Used by handle_ops_on_dev_replace() to copy logical bios + * into the new device. + * + * - Contain RAID56 full stripe logical bytenrs + */ +struct btrfs_io_context { refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; @@ -361,7 +398,7 @@ struct btrfs_bio { * so raid_map[0] is the start of our full stripe */ u64 *raid_map; - struct btrfs_bio_stripe stripes[]; + struct btrfs_io_stripe stripes[]; }; struct btrfs_device_info { @@ -396,11 +433,11 @@ struct map_lookup { int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ - struct btrfs_bio_stripe stripes[]; + struct btrfs_io_stripe stripes[]; }; #define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) + (sizeof(struct btrfs_io_stripe) * (n))) struct btrfs_balance_args; struct btrfs_balance_progress; @@ -414,6 +451,22 @@ struct btrfs_balance_control { struct btrfs_balance_progress stat; }; +/* + * Search for a given device by the set parameters + */ +struct btrfs_dev_lookup_args { + u64 devid; + u8 *uuid; + u8 *fsid; + bool missing; +}; + +/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */ +#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 } + +#define BTRFS_DEV_LOOKUP_ARGS(name) \ + struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT + enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE, @@ -437,20 +490,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } -void btrfs_get_bbio(struct btrfs_bio *bbio); -void btrfs_put_bbio(struct btrfs_bio *bbio); +void btrfs_get_bioc(struct btrfs_io_context *bioc); +void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num); + struct btrfs_io_context **bioc_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret); + struct btrfs_io_context **bioc_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map 
*map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); -struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, @@ -467,19 +520,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, const char *devpath); +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - const char *device_path, u64 devid, + struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid); +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -493,7 +550,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); -int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 8a4514283a4b..2837b4c8424d 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * matches our target xattr, so lets check. 
*/ ret = 0; - btrfs_assert_tree_locked(path->nodes[0]); + btrfs_assert_tree_write_locked(path->nodes[0]); di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8afa90074891..767a0c6c9694 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (in_page) + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); put_page(in_page); + } return ret; } @@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); + kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - + kunmap(pages_in[page_in_index]); page_in_index++; if (page_in_index >= 
total_pages_in) { data_in = NULL; break; } - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; done: zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap(pages_in[page_in_index]); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 47af1ab3bf12..67d932d70798 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -4,6 +4,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> +#include <linux/atomic.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" @@ -39,12 +40,30 @@ #define BTRFS_NR_SB_LOG_ZONES 2 /* + * Minimum of active zones we need: + * + * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors + * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group + * - 1 zone for tree-log dedicated block group + * - 1 zone for relocation + */ +#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) + +/* * Maximum supported zone size. Currently, SMR disks have a zone size of * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not * expect the zone size to become larger than 8GiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) + +static inline bool sb_zone_is_full(const struct blk_zone *zone) +{ + return (zone->cond == BLK_ZONE_COND_FULL) || + (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); +} + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -60,14 +79,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, bool empty[BTRFS_NR_SB_LOG_ZONES]; bool full[BTRFS_NR_SB_LOG_ZONES]; sector_t sector; + int i; - ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL && - zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL); - - empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY); - empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY); - full[0] = (zones[0].cond == BLK_ZONE_COND_FULL); - full[1] = (zones[1].cond == BLK_ZONE_COND_FULL); + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); + empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); + full[i] = sb_zone_is_full(&zones[i]); + } /* * Possible states of log buffer zones @@ -296,6 +314,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; + struct request_queue *queue = bdev_get_queue(bdev); + unsigned int max_active_zones; + unsigned int nactive; sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; @@ -351,6 +372,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; + max_active_zones = queue_max_active_zones(queue); + if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { + btrfs_err_in_rcu(fs_info, +"zoned: %s: max active zones %u is too small, need at least %u active zones", + rcu_str_deref(device->name), max_active_zones, + BTRFS_MIN_ACTIVE_ZONES); + ret = -EINVAL; + goto out; + } + zone_info->max_active_zones = max_active_zones; + zone_info->seq_zones = 
bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; @@ -363,6 +395,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) { + ret = -ENOMEM; + goto out; + } + zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; @@ -370,6 +408,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) } /* Get zones type */ + nactive = 0; while (sector < nr_sectors) { nr_zones = BTRFS_REPORT_NR_ZONES; ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, @@ -380,8 +419,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) for (i = 0; i < nr_zones; i++) { if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) __set_bit(nreported, zone_info->seq_zones); - if (zones[i].cond == BLK_ZONE_COND_EMPTY) + switch (zones[i].cond) { + case BLK_ZONE_COND_EMPTY: __set_bit(nreported, zone_info->empty_zones); + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + __set_bit(nreported, zone_info->active_zones); + nactive++; + break; + } nreported++; } sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; @@ -396,6 +444,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + if (max_active_zones) { + if (nactive > max_active_zones) { + btrfs_err_in_rcu(device->fs_info, + "zoned: %u active zones on %s exceeds max_active_zones %u", + nactive, rcu_str_deref(device->name), + max_active_zones); + ret = -EIO; + goto out; + } + atomic_set(&zone_info->active_zones_left, + max_active_zones - nactive); + } + /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -478,6 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) out: kfree(zones); out_free_zone_info: + bitmap_free(zone_info->active_zones); bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->seq_zones); kfree(zone_info); @@ -493,6 +555,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) if (!zone_info) return; + bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); kfree(zone_info); @@ -585,7 +648,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) /* * stripe_size is always aligned to BTRFS_STRIPE_LEN in - * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size, + * btrfs_create_chunk(). Since we want stripe_len == zone_size, * check the alignment here. */ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { @@ -664,7 +727,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { - ASSERT(reset->cond == BLK_ZONE_COND_FULL); + ASSERT(sb_zone_is_full(reset)); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, reset->start, reset->len, @@ -676,9 +739,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset->wp = reset->start; } } else if (ret != -ENOENT) { - /* For READ, we want the precious one */ + /* + * For READ, we want the previous one. Move write pointer to + * the end of a zone, if it is at the head of a zone. 
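A byte-level model of the read-side math that follows, assuming BTRFS_SUPER_INFO_SIZE is 4 KiB (the kernel code works in 512-byte sectors and shifts by SECTOR_SHIFT, but the idea is the same): when the write pointer sits at the start of one log zone, the newest superblock copy is the last 4 KiB slot below the other zone's capacity end.

#include <stdint.h>

#define SB_SIZE 4096ULL /* models BTRFS_SUPER_INFO_SIZE */

/* Illustrative helper: byte offset of the most recent superblock copy in the
 * other log zone, given that zone's start and usable capacity in bytes. */
static uint64_t last_sb_copy(uint64_t other_zone_start, uint64_t other_zone_capacity)
{
        uint64_t zone_end = other_zone_start + other_zone_capacity;
        uint64_t wp = zone_end & ~(SB_SIZE - 1);        /* ALIGN_DOWN to a 4 KiB slot */

        return wp - SB_SIZE;                            /* start of the last slot */
}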
+ */ + u64 zone_end = 0; + if (wp == zones[0].start << SECTOR_SHIFT) - wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT; + zone_end = zones[1].start + zones[1].capacity; + else if (wp == zones[1].start << SECTOR_SHIFT) + zone_end = zones[0].start + zones[0].capacity; + if (zone_end) + wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, + BTRFS_SUPER_INFO_SIZE); + wp -= BTRFS_SUPER_INFO_SIZE; } @@ -771,36 +845,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, return true; } -void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) { struct btrfs_zoned_device_info *zinfo = device->zone_info; struct blk_zone *zone; + int i; if (!is_sb_log_zone(zinfo, mirror)) - return; + return 0; zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; - if (zone->cond != BLK_ZONE_COND_FULL) { + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + /* Advance the next zone */ + if (zone->cond == BLK_ZONE_COND_FULL) { + zone++; + continue; + } + if (zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; - zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + zone->wp += SUPER_INFO_SECTORS; + + if (sb_zone_is_full(zone)) { + /* + * No room left to write new superblock. Since + * superblock is written with REQ_SYNC, it is safe to + * finish the zone now. + * + * If the write pointer is exactly at the capacity, + * explicit ZONE_FINISH is not necessary. + */ + if (zone->wp != zone->start + zone->capacity) { + int ret; + + ret = blkdev_zone_mgmt(device->bdev, + REQ_OP_ZONE_FINISH, zone->start, + zone->len, GFP_NOFS); + if (ret) + return ret; + } - if (zone->wp == zone->start + zone->len) + zone->wp = zone->start + zone->len; zone->cond = BLK_ZONE_COND_FULL; - - return; + } + return 0; } - zone++; - ASSERT(zone->cond != BLK_ZONE_COND_FULL); - if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); - - if (zone->wp == zone->start + zone->len) - zone->cond = BLK_ZONE_COND_FULL; + /* All the zones are FULL. Should not reach here. */ + ASSERT(0); + return -EIO; } int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) @@ -895,6 +989,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, return pos; } +static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return true; + + if (!test_bit(zno, zone_info->active_zones)) { + /* Active zone left? 
*/ + if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) + return false; + if (test_and_set_bit(zno, zone_info->active_zones)) { + /* Someone already set the bit */ + atomic_inc(&zone_info->active_zones_left); + } + } + + return true; +} + +static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return; + + if (test_and_clear_bit(zno, zone_info->active_zones)) + atomic_inc(&zone_info->active_zones_left); +} + int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { @@ -910,6 +1039,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, *bytes = length; while (length) { btrfs_dev_set_zone_empty(device, physical); + btrfs_dev_clear_active_zone(device, physical); physical += device->zone_info->zone_size; length -= device->zone_info->zone_size; } @@ -1039,6 +1169,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; + u64 *caps = NULL; + unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1063,10 +1195,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) map = em->map_lookup; + cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + if (!cache->physical_map) { + ret = -ENOMEM; + goto out; + } + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); if (!alloc_offsets) { - free_extent_map(em); - return -ENOMEM; + ret = -ENOMEM; + goto out; + } + + caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); + if (!caps) { + ret = -ENOMEM; + goto out; + } + + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); + if (!active) { + ret = -ENOMEM; + goto out; } for (i = 0; i < map->num_stripes; i++) { @@ -1131,6 +1281,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + caps[i] = (zone.capacity << SECTOR_SHIFT); + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: @@ -1144,14 +1296,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) alloc_offsets[i] = 0; break; case BLK_ZONE_COND_FULL: - alloc_offsets[i] = fs_info->zone_size; + alloc_offsets[i] = caps[i]; break; default: /* Partially used zone */ alloc_offsets[i] = ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(i, active); break; } + + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); } if (num_sequential > 0) @@ -1169,6 +1329,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * calculate_alloc_pointer() which takes extent buffer * locks to avoid deadlock. 
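btrfs_dev_set_active_zone() above pairs the per-device active_zones bitmap with the active_zones_left counter so that a zone is only activated while the device still has budget, without holding a lock. A self-contained user-space model of that reserve-then-claim pattern; the type and function names are invented for illustration, and atomic_dec_if_positive() is modelled with a compare-and-swap loop:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct zone_pool {
        atomic_int slots_left;                  /* models active_zones_left */
        atomic_uint_least64_t active[4];        /* models the active_zones bitmap */
};

static bool zone_set_active(struct zone_pool *pool, unsigned int zno)
{
        atomic_uint_least64_t *word = &pool->active[zno / 64];
        uint64_t mask = UINT64_C(1) << (zno % 64);
        int old;

        if (atomic_load(word) & mask)           /* zone is already active */
                return true;

        /* Take one slot, but only if one is left (atomic_dec_if_positive). */
        old = atomic_load(&pool->slots_left);
        do {
                if (old <= 0)
                        return false;           /* no active-zone budget left */
        } while (!atomic_compare_exchange_weak(&pool->slots_left, &old, old - 1));

        /* Claim the bit; if another thread raced us to it, give the slot back. */
        if (atomic_fetch_or(word, mask) & mask)
                atomic_fetch_add(&pool->slots_left, 1);

        return true;
}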
*/ + + /* Zone capacity is always zone size in emulation */ + cache->zone_capacity = cache->length; if (new) { cache->alloc_offset = 0; goto out; } @@ -1195,6 +1358,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = caps[0]; + cache->zone_is_active = test_bit(0, active); break; case BTRFS_BLOCK_GROUP_DUP: case BTRFS_BLOCK_GROUP_RAID1: @@ -1210,6 +1375,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (cache->zone_is_active) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1218,6 +1390,14 @@ out: ret = -EIO; } + if (cache->alloc_offset > cache->zone_capacity) { + btrfs_err(fs_info, +"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", + cache->alloc_offset, cache->zone_capacity, + cache->start); + ret = -EIO; + } + /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, @@ -1229,6 +1409,12 @@ out: if (!ret) cache->meta_write_pointer = cache->alloc_offset + cache->start; + if (ret) { + kfree(cache->physical_map); + cache->physical_map = NULL; + } + bitmap_free(active); + kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1243,17 +1429,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - unusable = cache->alloc_offset - cache->used; - free = cache->length - cache->alloc_offset; + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; - - /* Should not have any excluded extents. Just in case, though */ - btrfs_free_excluded_extents(cache); } void btrfs_redirty_list_add(struct btrfs_transaction *trans, @@ -1304,6 +1488,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + /* + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions the + * relocation code makes about the extent layout. + * Furthermore we have set aside our own block group from which only the + * relocation "process" can allocate, and we make sure only one process at + * a time can add pages to an extent that gets relocated, so it's safe to + * use regular REQ_OP_WRITE for this special case.
+ */ + if (btrfs_is_data_reloc_root(inode->root)) + return false; + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) @@ -1440,27 +1635,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, struct blk_zone *zone) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 mapped_length = PAGE_SIZE; unsigned int nofs_flag; int nmirrors; int i, ret; ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bbio); - if (ret || !bbio || mapped_length < PAGE_SIZE) { - btrfs_put_bbio(bbio); + &mapped_length, &bioc); + if (ret || !bioc || mapped_length < PAGE_SIZE) { + btrfs_put_bioc(bioc); return -EIO; } - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) return -EINVAL; nofs_flag = memalloc_nofs_save(); - nmirrors = (int)bbio->num_stripes; + nmirrors = (int)bioc->num_stripes; for (i = 0; i < nmirrors; i++) { - u64 physical = bbio->stripes[i].physical; - struct btrfs_device *dev = bbio->stripes[i].dev; + u64 physical = bioc->stripes[i].physical; + struct btrfs_device *dev = bioc->stripes[i].dev; /* Missing device */ if (!dev->bdev) @@ -1530,3 +1725,251 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } + +/** + * Activate block group and underlying device zones + * + * @block_group: the block group to activate + * + * Return: true on success, false otherwise + */ +bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + bool ret; + + if (!btrfs_is_zoned(block_group->fs_info)) + return true; + + map = block_group->physical_map; + /* Currently support SINGLE profile only */ + ASSERT(map->num_stripes == 1); + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (device->zone_info->max_active_zones == 0) + return true; + + spin_lock(&block_group->lock); + + if (block_group->zone_is_active) { + ret = true; + goto out_unlock; + } + + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; + + spin_unlock(&block_group->lock); + + /* For the active block group list */ + btrfs_get_block_group(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(list_empty(&block_group->active_bg_list)); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + + return true; + +out_unlock: + spin_unlock(&block_group->lock); + return ret; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + map = block_group->physical_map; + /* Currently support SINGLE profile only */ + ASSERT(map->num_stripes == 1); + + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (device->zone_info->max_active_zones == 0) + return 0; + + spin_lock(&block_group->lock); + if (!block_group->zone_is_active) { + 
spin_unlock(&block_group->lock); + return 0; + } + + /* Check if we have unwritten allocated space */ + if ((block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + block_group->alloc_offset > block_group->meta_write_pointer) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } + spin_unlock(&block_group->lock); + + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } + + block_group->zone_is_active = 0; + block_group->alloc_offset = block_group->zone_capacity; + block_group->free_space_ctl->free_space = 0; + btrfs_clear_treelog_bg(block_group); + spin_unlock(&block_group->lock); + + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); + btrfs_dec_block_group_ro(block_group); + + if (!ret) { + btrfs_dev_clear_active_zone(device, physical); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + } + + return ret; +} + +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index) +{ + struct btrfs_device *device; + bool ret = false; + + if (!btrfs_is_zoned(fs_devices->fs_info)) + return true; + + /* Non-single profiles are not supported yet */ + if (raid_index != BTRFS_RAID_SINGLE) + return false; + + /* Check if there is a device with active zones left */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + + if (!device->bdev) + continue; + + if (!zinfo->max_active_zones || + atomic_read(&zinfo->active_zones_left)) { + ret = true; + break; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +{ + struct btrfs_block_group *block_group; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + ASSERT(block_group); + + if (logical + length < block_group->start + block_group->zone_capacity) + goto out; + + spin_lock(&block_group->lock); + + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + goto out; + } + + block_group->zone_is_active = 0; + /* We should have consumed all the free space */ + ASSERT(block_group->alloc_offset == block_group->zone_capacity); + ASSERT(block_group->free_space_ctl->free_space == 0); + btrfs_clear_treelog_bg(block_group); + spin_unlock(&block_group->lock); + + map = block_group->physical_map; + device = map->stripes[0].dev; + physical = 
map->stripes[0].physical; + + if (!device->zone_info->max_active_zones) + goto out; + + btrfs_dev_clear_active_zone(device, physical); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + btrfs_put_block_group(block_group); + +out: + btrfs_put_block_group(block_group); +} + +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg == bg->start) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 4b299705bb12..e53ab7b96437 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -23,8 +23,11 @@ struct btrfs_zoned_device_info { u64 zone_size; u8 zone_size_shift; u32 nr_zones; + unsigned int max_active_zones; + atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; + unsigned long *active_zones; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -40,7 +43,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); -void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, u64 hole_end, u64 num_bytes); @@ -66,6 +69,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +bool btrfs_zone_activate(struct btrfs_block_group *block_group); +int btrfs_zone_finish(struct btrfs_block_group *block_group); +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + int raid_index); +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -113,8 +123,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror, return 0; } -static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) -{ } +static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + return 0; +} static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { @@ -199,6 +211,27 @@ static inline struct btrfs_device *btrfs_zoned_get_device( return ERR_PTR(-EOPNOTSUPP); } +static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + return true; +} + +static inline int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + return 0; +} + +static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + int raid_index) +{ + return true; +} + +static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 
56dce9f00988..f06b68040352 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; + kunmap(in_page); put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, out: *out_pages = nr_pages; /* Cleanup */ - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } + if (out_page) + kunmap(out_page); return ret; } @@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - page_in_index++; + kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ 
-607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; zero_fill_bio(cb->orig_bio); done: + if (workspace->in_buf.src) + kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index c615387aedca..46bc589b7a03 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) { sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); + loff_t sz = bdev_nr_bytes(bdev); if (sz) { unsigned int sizebits = blksize_bits(size); @@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); + sector_t end_block = blkdev_max_block(bdev, size); do { if (!buffer_mapped(bh)) { diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index fac2e8e7b533..effe37ef8629 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) /* * Handle completion of a read from the cache. */ -static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2) +static void cachefiles_read_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); - _enter("%ld,%ld", ret, ret2); + _enter("%ld", ret); if (ki->term_func) { if (ret >= 0) @@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, fallthrough; default: ki->was_async = false; - cachefiles_read_complete(&ki->iocb, ret, 0); + cachefiles_read_complete(&ki->iocb, ret); if (ret > 0) ret = 0; break; @@ -159,12 +159,12 @@ presubmission_error: /* * Handle completion of a write to the cache. 
*/ -static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2) +static void cachefiles_write_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); struct inode *inode = file_inode(ki->iocb.ki_filp); - _enter("%ld,%ld", ret, ret2); + _enter("%ld", ret); /* Tell lockdep we inherited freeze protection from submission thread */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); @@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, fallthrough; default: ki->was_async = false; - cachefiles_write_complete(&ki->iocb, ret, 0); + cachefiles_write_complete(&ki->iocb, ret); if (ret > 0) ret = 0; break; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8ffc40e84a59..fcf4f3b72923 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, struct cachefiles_object *object; struct fscache_retrieval *op = monitor->op; struct wait_page_key *key = _key; - struct page *page = wait->private; + struct folio *folio = wait->private; ASSERT(key); _enter("{%lu},%u,%d,{%p,%u}", monitor->netfs_page->index, mode, sync, - key->page, key->bit_nr); + key->folio, key->bit_nr); - if (key->page != page || key->bit_nr != PG_locked) + if (key->folio != folio || key->bit_nr != PG_locked) return 0; - _debug("--- monitor %p %lx ---", page, page->flags); + _debug("--- monitor %p %lx ---", folio, folio->flags); - if (!PageUptodate(page) && !PageError(page)) { + if (!folio_test_uptodate(folio) && !folio_test_error(folio)) { /* unlocked, not uptodate and not erronous? */ _debug("page probably truncated"); } @@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, put_page(backpage2); INIT_LIST_HEAD(&monitor->op_link); - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); if (trylock_page(backpage)) { ret = -EIO; @@ -294,7 +294,7 @@ monitor_backing_page: get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was installed, so @@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3e42d0466521..8f537f1d9d1d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2330,7 +2330,6 @@ retry: int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; @@ -2365,14 +2364,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (err < 0) ret = err; - if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { - spin_lock(&file->f_lock); - err = errseq_check_and_advance(&ci->i_meta_err, - &fi->meta_err); - spin_unlock(&file->f_lock); - if (err < 0) - ret = err; - } + err = file_check_and_advance_wb_err(file); + if (err < 0) + ret = err; out: dout("fsync 
%p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d16fd2d5fd42..b129ea551378 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -233,7 +233,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); - fi->meta_err = errseq_sample(&ci->i_meta_err); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; @@ -1023,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode, ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : CEPH_CAP_FILE_RD)); - aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + aio_req->iocb->ki_complete(aio_req->iocb, ret); ceph_free_cap_flush(aio_req->prealloc_cf); kfree(aio_req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2df1e1284451..1c7574105478 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -541,8 +541,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); - ci->i_meta_err = 0; - return &ci->vfs_inode; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index bdeb271f47d9..d8c31069fbf2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* No mandatory locks */ - if (fl->fl_type & LOCK_MAND) - return -EOPNOTSUPP; dout("ceph_flock, fl_file: %p\n", fl->fl_file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7cad180d6deb..d64413adc0fd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1493,7 +1493,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; - struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1502,16 +1501,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); - if (req->r_target_inode) { - /* dropping unsafe change of inode's attributes */ - ci = ceph_inode(req->r_target_inode); - errseq_set(&ci->i_meta_err, -EIO); - } - if (req->r_unsafe_dir) { - /* dropping unsafe directory operation */ - ci = ceph_inode(req->r_unsafe_dir); - errseq_set(&ci->i_meta_err, -EIO); - } + if (req->r_target_inode) + mapping_set_error(req->r_target_inode->i_mapping, -EIO); + if (req->r_unsafe_dir) + mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1678,7 +1671,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { - errseq_set(&ci->i_meta_err, -EIO); + mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9b1b7f4cfdd4..fd8742bae847 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1002,16 +1002,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; - struct ceph_fs_client *other = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (compare_mount_options(fsopt, opt, other)) { + if 
(compare_mount_options(fsopt, opt, fsc)) { dout("monitor(s)/mount options don't match\n"); return 0; } if ((opt->flags & CEPH_OPT_FSID) && - ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { dout("fsid doesn't match\n"); return 0; } @@ -1019,6 +1019,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) dout("flags differ\n"); return 0; } + + if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { + dout("client is blocklisted (and CLEANRECOVER is not set)\n"); + return 0; + } + + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + dout("client has been forcibly unmounted\n"); + return 0; + } + return 1; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a40eb14c282a..14f951cd5b61 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -429,8 +429,6 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; #endif - errseq_t i_meta_err; - struct inode vfs_inode; /* at end */ }; @@ -774,7 +772,6 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; - errseq_t meta_err; u32 filp_gen; atomic_t num_locks; }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 13f3182cf796..1b855fcb179e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3184,7 +3184,7 @@ restart_loop: mutex_unlock(&ctx->aio_mutex); if (ctx->iocb && ctx->iocb->ki_complete) - ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + ctx->iocb->ki_complete(ctx->iocb, ctx->rc); else complete(&ctx->done); } @@ -3917,7 +3917,7 @@ again: mutex_unlock(&ctx->aio_mutex); if (ctx->iocb && ctx->iocb->ki_complete) - ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + ctx->iocb->ki_complete(ctx->iocb, ctx->rc); else complete(&ctx->done); } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 2be65269a987..666aa380011e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, return read_buffers[i] + blk_offset; } - devsize = mapping->host->i_size >> PAGE_SHIFT; + devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ for (i = 0; i < BLKS_PER_BUF; i++) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 68a2de6b5a9b..bfc2a5b74ed3 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This contains encryption functions for per-file encryption. + * Utility functions for file contents encryption/decryption on + * block device-based filesystems. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Add fscrypt_pullback_bio_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. */ #include <linux/pagemap.h> @@ -26,6 +13,21 @@ #include <linux/namei.h> #include "fscrypt_private.h" +/** + * fscrypt_decrypt_bio() - decrypt the contents of a bio + * @bio: the bio to decrypt + * + * Decrypt the contents of a "read" bio following successful completion of the + * underlying disk read. The bio must be reading a whole number of blocks of an + * encrypted file directly into the page cache. 
If the bio is reading the + * ciphertext into bounce pages instead of the page cache (for example, because + * the file is also compressed, so decompression is required after decryption), + * then this function isn't applicable. This function may sleep, so it must be + * called from a workqueue rather than from the bio's bi_end_io callback. + * + * This function sets PG_error on any pages that contain any blocks that failed + * to be decrypted. The filesystem must not mark such pages uptodate. + */ void fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index eb538c28df94..a9be4bc74a94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, - dir->i_sb->s_cop->max_namelen, + iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 3fa965eb3336..5b0a9e6478b5 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -20,6 +20,11 @@ #define FSCRYPT_FILE_NONCE_SIZE 16 +/* + * Minimum size of an fscrypt master key. Note: a longer key will be required + * if ciphers with a 256-bit security strength are used. This is just the + * absolute minimum, which applies when only 128-bit encryption is used. + */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 @@ -413,7 +418,11 @@ struct fscrypt_master_key_secret { */ struct fscrypt_hkdf hkdf; - /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + /* + * Size of the raw key in bytes. This remains set even if ->raw was + * zeroized due to no longer being needed. I.e. we still remember the + * size of the key even if we don't need to remember the key itself. + */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ @@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void); struct fscrypt_mode { const char *friendly_name; const char *cipher_str; - int keysize; - int ivsize; + int keysize; /* key size in bytes */ + int security_strength; /* security strength in bytes */ + int ivsize; /* IV size in bytes */ int logged_impl_name; enum blk_crypto_mode_num blk_crypto_mode; }; diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index e0ec21055505..7607d18b35fc 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -16,9 +16,14 @@ /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses - * SHA-512 because it is reasonably secure and efficient; and since it produces - * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of - * entropy from the master key and requires only one iteration of HKDF-Expand. + * SHA-512 because it is well-established, secure, and reasonably efficient. + * + * HKDF-SHA256 was also considered, as its 256-bit security strength would be + * sufficient here. A 512-bit security strength is "nice to have", though. + * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the + * common case of deriving an AES-256-XTS key (512 bits), that can result in + * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of + * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. 
*/ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index bca9c6658a7c..eede186b04ce 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, @@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-256-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 32, + .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, @@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-128-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, @@ -117,8 +122,9 @@ err_free_tfm: /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the - * raw key, encryption mode, and flag indicating which encryption implementation - * (fs-layer or blk-crypto) will be used. + * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption + * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), + * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci) @@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, } /* + * Check whether the size of the given master key (@mk) is appropriate for the + * encryption settings which a particular file will use (@ci). + * + * If the file uses a v1 encryption policy, then the master key must be at least + * as long as the derived key, as this is a requirement of the v1 KDF. + * + * Otherwise, the KDF can accept any size key, so we enforce a slightly looser + * requirement: we require that the size of the master key be at least the + * maximum security strength of any algorithm whose key will be derived from it + * (but in practice we only need to consider @ci->ci_mode, since any other + * possible subkeys such as DIRHASH and INODE_HASH will never increase the + * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be + * derived from a 256-bit master key, which is cryptographically sufficient, + * rather than requiring a 512-bit master key which is unnecessarily long. (We + * still allow 512-bit master keys if the user chooses to use them, though.) 
+ */ +static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, + const struct fscrypt_info *ci) +{ + unsigned int min_keysize; + + if (ci->ci_policy.version == FSCRYPT_POLICY_V1) + min_keysize = ci->ci_mode->keysize; + else + min_keysize = ci->ci_mode->security_strength; + + if (mk->mk_secret.size < min_keysize) { + fscrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), + (u8 *)&mk->mk_spec.u, + mk->mk_secret.size, min_keysize); + return false; + } + return true; +} + +/* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then the @@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, goto out_release_key; } - /* - * Require that the master key be at least as long as the derived key. - * Otherwise, the derived key cannot possibly contain as much entropy as - * that required by the encryption mode it will be used for. For v1 - * policies it's also required for the KDF to work at all. - */ - if (mk->mk_secret.size < ci->ci_mode->keysize) { - fscrypt_warn(NULL, - "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", - master_key_spec_type(&mk_spec), - master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, - mk->mk_secret.size, ci->ci_mode->keysize); + if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } diff --git a/fs/direct-io.c b/fs/direct-io.c index b2e86e739d7a..654443558047 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -119,7 +119,6 @@ struct dio { int flags; /* doesn't change */ int op; int op_flags; - blk_qc_t bio_cookie; struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ @@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) if (ret > 0 && dio->op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, ret); - dio->iocb->ki_complete(dio->iocb, ret, 0); + dio->iocb->ki_complete(dio->iocb, ret); } kmem_cache_free(dio_cache, dio); @@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->bio_disk = bio->bi_bdev->bd_disk; - if (sdio->submit_io) { + if (sdio->submit_io) sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); - dio->bio_cookie = BLK_QC_T_NONE; - } else - dio->bio_cookie = submit_bio(bio); + else + submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - blk_io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else { dio->op = REQ_OP_READ; } - if (iocb->ki_flags & IOCB_HIPRI) - dio->op_flags |= REQ_HIPRI; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 14b747026742..f57255ab88ed 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -6,16 +6,22 @@ config EROFS_FS select FS_IOMAP select LIBCRC32C help - EROFS (Enhanced Read-Only File System) is a lightweight - read-only file system with modern designs (eg. 
page-sized
-	  blocks, inline xattrs/data, etc.) for scenarios which need
-	  high-performance read-only requirements, e.g. Android OS
-	  for mobile phones and LIVECDs.
+	  EROFS (Enhanced Read-Only File System) is a lightweight read-only
+	  file system with modern designs (e.g. no buffer heads, inline
+	  xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+	  scenarios which need high-performance read-only solutions, e.g.
+	  smartphones with Android OS, LiveCDs and high-density hosts with
+	  numerous containers;
-	  It also provides fixed-sized output compression support,
-	  which improves storage density, keeps relatively higher
-	  compression ratios, which is more useful to achieve high
-	  performance for embedded devices with limited memory.
+	  It also provides fixed-sized output compression support in order to
+	  improve storage density as well as keep relatively higher compression
+	  ratios and implements in-place decompression to reuse the file page
+	  for compressed data temporarily with proper strategies, which is
+	  quite useful to ensure guaranteed end-to-end runtime decompression
+	  performance under extreme memory pressure without extra cost.
+
+	  See the documentation at <file:Documentation/filesystems/erofs.rst>
+	  for more details.

 	  If unsure, say N.
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
 	  Enable fixed-sized output compression for EROFS.

 	  If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+	bool "EROFS LZMA compressed data support"
+	depends on EROFS_FS_ZIP
+	select XZ_DEC
+	select XZ_DEC_MICROLZMA
+	help
+	  Saying Y here includes support for reading EROFS file systems
+	  containing LZMA compressed data, specifically called microLZMA. It
+	  gives better compression ratios than the LZ4 algorithm, at the
+	  expense of more CPU overhead.
+
+	  LZMA support is an experimental feature for now and so most file
+	  systems will be readable without selecting this option.
+
+	  If unsure, say N.
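
For reference, the EROFS LZMA path enabled by this new Kconfig option (fs/erofs/decompressor_lzma.c, added later in this series) drives the kernel's MicroLZMA decoder through the xz_dec_microlzma_* API pulled in by XZ_DEC_MICROLZMA. Below is a minimal sketch of that call sequence; it assumes both the compressed and the decompressed buffers are contiguous kernel memory (the real decompressor walks per-page arrays and bounce buffers instead), and the helper name is made up for illustration:

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/xz.h>

/* Hypothetical single-shot helper; both sizes must be known up front. */
static int microlzma_decompress_once(const u8 *in, size_t inlen,
				     u8 *out, size_t outlen, u32 dict_size)
{
	struct xz_dec_microlzma *s;
	struct xz_buf b = {
		.in = in,   .in_pos = 0,  .in_size = inlen,
		.out = out, .out_pos = 0, .out_size = outlen,
	};
	enum xz_ret xz_err;

	/* XZ_PREALLOC allocates the dictionary once, up to dict_size bytes. */
	s = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
	if (!s)
		return -ENOMEM;

	/* The last argument tells the decoder the uncompressed size is exact. */
	xz_dec_microlzma_reset(s, inlen, outlen, true);
	xz_err = xz_dec_microlzma_run(s, &b);
	xz_dec_microlzma_end(s);

	/* XZ_STREAM_END means the whole uncompressed output was produced. */
	return (xz_err == XZ_STREAM_END && b.out_pos == outlen) ? 0 : -EIO;
}

The real code avoids this per-call allocation: it keeps a small pool of preallocated decoder states (one per possible CPU by default, tunable via the lzma_streams module parameter) and grows the shared dictionary size lazily when a filesystem with a larger on-disk dict_size is mounted, so no dictionary allocation happens on the I/O path.
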
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 1f9aced49070..756fe2d65272 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o +erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3701c72bacb2..579406504919 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -8,11 +8,6 @@ #include "internal.h" -enum { - Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, - Z_EROFS_COMPRESSION_RUNTIME_MAX -}; - struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; @@ -25,6 +20,12 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +struct z_erofs_decompressor { + int (*decompress)(struct z_erofs_decompress_req *rq, + struct page **pagepool); + char *name; +}; + /* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) #define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) @@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page) return true; } -static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, +static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, struct page *page) { if (!z_erofs_is_shortlived_page(page)) @@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, put_page(page); } else { /* follow the pcluster rule above. */ - set_page_private(page, 0); - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); } return true; } +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); + struct page **pagepool); +/* prototypes for specific algorithms */ +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 9db829715652..808234d9190c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode, erofs_off_t pos; int err = 0; + map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; @@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode, map->m_flags = 0; break; default: - /* only one device is supported for now */ - if (idx->device_id) { - erofs_err(sb, "invalid device id %u @ %llu for nid %llu", - le16_to_cpu(idx->device_id), - chunknr, vi->nid); - err = -EFSCORRUPTED; - goto out_unlock; - } + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; @@ -155,11 +150,55 @@ out: return err; } +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + /* primary device by default */ + map->m_bdev = sb->s_bdev; + map->m_daxdev = EROFS_SB(sb)->dax_dev; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; + } + 
map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + up_read(&devs->rwsem); + } else if (devs->extra_devices) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = blknr_to_addr(dif->mapped_blkaddr); + length = blknr_to_addr(dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + break; + } + } + up_read(&devs->rwsem); + } + return 0; +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map; + struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; @@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(inode->i_sb, &mdev); + if (ret) + return ret; + + iomap->bdev = mdev.m_bdev; + iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; iomap->length = map.m_llen; iomap->flags = 0; @@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_INLINE; ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(map.m_pa)); + erofs_blknr(mdev.m_pa)); if (IS_ERR(ipage)) return PTR_ERR(ipage); iomap->inline_data = page_address(ipage) + - erofs_blkoff(map.m_pa); + erofs_blkoff(mdev.m_pa); iomap->private = ipage; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = mdev.m_pa; } return 0; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index a5bc4b1b7813..bf37fc76b182 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,17 +16,6 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif -struct z_erofs_decompressor { - /* - * if destpages have sparsed pages, fill them with bounce pages. - * it also check whether destpages indicate continuous physical memory. - */ - int (*prepare_destpages)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); - int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); - char *name; -}; - int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb, return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } -static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +/* + * Fill all gaps with bounce pages if it's a sparse page list. Also check if + * all physical pages are consecutive, which can be seen for moderate CR. + */ +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nr = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 
1 : 0; } -static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, void *inpage, unsigned int *inputmargin, int *maptype, bool support_0padding) { @@ -189,7 +182,8 @@ docopy: return src; } -static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, + u8 *out) { unsigned int inputmargin; u8 *headpage, *src; @@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) } rq->inputsize -= inputmargin; - src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, - support_0padding); + src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, + &maptype, support_0padding); if (IS_ERR(src)) return PTR_ERR(src); @@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); - WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, 16, 1, src + inputmargin, rq->inputsize, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, @@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EIO; + } else { + ret = 0; } if (maptype == 0) { @@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) return ret; } -static struct z_erofs_decompressor decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { - .name = "shifted" - }, - [Z_EROFS_COMPRESSION_LZ4] = { - .prepare_destpages = z_erofs_lz4_prepare_destpages, - .decompress = z_erofs_lz4_decompress, - .name = "lz4" - }, -}; - -static void copy_from_pcpubuf(struct page **out, const char *dst, - unsigned short pageofs_out, - unsigned int outputsize) -{ - const char *end = dst + outputsize; - const unsigned int righthalf = PAGE_SIZE - pageofs_out; - const char *cur = dst - pageofs_out; - - while (cur < end) { - struct page *const page = *out++; - - if (page) { - char *buf = kmap_atomic(page); - - if (cur >= dst) { - memcpy(buf, cur, min_t(uint, PAGE_SIZE, - end - cur)); - } else { - memcpy(buf + pageofs_out, cur + pageofs_out, - min_t(uint, righthalf, end - cur)); - } - kunmap_atomic(buf); - } - cur += PAGE_SIZE; - } -} - -static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; int ret; - /* two optimized fast paths only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE) { - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } - - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. 
- */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(1); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; - } + /* one optimized fast path only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; } /* general decoding path which can be used for all cases */ - ret = alg->prepare_destpages(rq, pagepool); + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); if (ret < 0) return ret; if (ret) { @@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, dst_maptype = 2; dstmap_out: - ret = alg->decompress(rq, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); @@ -360,8 +294,8 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, return 0; } +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .decompress = z_erofs_shifted_transform, + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +#ifdef CONFIG_EROFS_FS_ZIP_LZMA + [Z_EROFS_COMPRESSION_LZMA] = { + .decompress = z_erofs_lzma_decompress, + .name = "lzma" + }, +#endif +}; + int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { - if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) - return z_erofs_shifted_transform(rq, pagepool); - return z_erofs_decompress_generic(rq, pagepool); + return decompressors[rq->alg].decompress(rq, pagepool); } diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c new file mode 100644 index 000000000000..50045510a1f4 --- /dev/null +++ b/fs/erofs/decompressor_lzma.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/xz.h> +#include <linux/module.h> +#include "compress.h" + +struct z_erofs_lzma { + struct z_erofs_lzma *next; + struct xz_dec_microlzma *state; + struct xz_buf buf; + u8 bounce[PAGE_SIZE]; +}; + +/* considering the LZMA performance, no need to use a lockless list for now */ +static DEFINE_SPINLOCK(z_erofs_lzma_lock); +static unsigned int z_erofs_lzma_max_dictsize; +static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms; +static struct z_erofs_lzma *z_erofs_lzma_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); + +module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); + +void z_erofs_lzma_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_lzma_avail_strms) { + struct z_erofs_lzma *strm; + + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + DBG_BUGON(1); + return; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + while (strm) { + struct z_erofs_lzma *n = strm->next; + + if (strm->state) + xz_dec_microlzma_end(strm->state); + kfree(strm); + 
--z_erofs_lzma_avail_strms; + strm = n; + } + } +} + +int z_erofs_lzma_init(void) +{ + unsigned int i; + + /* by default, use # of possible CPUs instead */ + if (!z_erofs_lzma_nstrms) + z_erofs_lzma_nstrms = num_possible_cpus(); + + for (i = 0; i < z_erofs_lzma_nstrms; ++i) { + struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL); + + if (!strm) { + z_erofs_lzma_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + ++z_erofs_lzma_avail_strms; + } + return 0; +} + +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) +{ + static DEFINE_MUTEX(lzma_resize_mutex); + unsigned int dict_size, i; + struct z_erofs_lzma *strm, *head = NULL; + int err; + + if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) { + erofs_err(sb, "invalid lzma cfgs, size=%u", size); + return -EINVAL; + } + if (lzma->format) { + erofs_err(sb, "unidentified lzma format %x, please check kernel version", + le16_to_cpu(lzma->format)); + return -EINVAL; + } + dict_size = le32_to_cpu(lzma->dict_size); + if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) { + erofs_err(sb, "unsupported lzma dictionary size %u", + dict_size); + return -EINVAL; + } + + erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); + + /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ + mutex_lock(&lzma_resize_mutex); + + if (z_erofs_lzma_max_dictsize >= dict_size) { + mutex_unlock(&lzma_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + for (i = 0; i < z_erofs_lzma_avail_strms; ++i) { + struct z_erofs_lzma *last; + +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, + READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + for (last = strm; last->next; last = last->next) + ++i; + last->next = head; + head = strm; + } + + err = 0; + /* 2. walk each isolated stream and grow max dict_size if needed */ + for (strm = head; strm; strm = strm->next) { + if (strm->state) + xz_dec_microlzma_end(strm->state); + strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size); + if (!strm->state) + err = -ENOMEM; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_lzma_lock); + DBG_BUGON(z_erofs_lzma_head); + z_erofs_lzma_head = head; + spin_unlock(&z_erofs_lzma_lock); + + z_erofs_lzma_max_dictsize = dict_size; + mutex_unlock(&lzma_resize_mutex); + return err; +} + +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + unsigned int inputmargin, inlen, outlen, pageofs; + struct z_erofs_lzma *strm; + u8 *kin; + bool bounced = false; + int no, ni, j, err = 0; + + /* 1. get the exact LZMA compressed size */ + kin = kmap(*rq->in); + inputmargin = 0; + while (!kin[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= PAGE_SIZE) { + kunmap(*rq->in); + return -EFSCORRUPTED; + } + rq->inputsize -= inputmargin; + + /* 2. 
get an available lzma context */ +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = strm->next; + spin_unlock(&z_erofs_lzma_lock); + + /* 3. multi-call decompress */ + inlen = rq->inputsize; + outlen = rq->outputsize; + xz_dec_microlzma_reset(strm->state, inlen, outlen, + !rq->partial_decoding); + pageofs = rq->pageofs_out; + strm->buf.in = kin + inputmargin; + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + inlen -= strm->buf.in_size; + strm->buf.out = NULL; + strm->buf.out_pos = 0; + strm->buf.out_size = 0; + + for (ni = 0, no = -1;;) { + enum xz_ret xz_err; + + if (strm->buf.out_pos == strm->buf.out_size) { + if (strm->buf.out) { + kunmap(rq->out[no]); + strm->buf.out = NULL; + } + + if (++no >= nrpages_out || !outlen) { + erofs_err(rq->sb, "decompressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.out_pos = 0; + strm->buf.out_size = min_t(u32, outlen, + PAGE_SIZE - pageofs); + outlen -= strm->buf.out_size; + if (rq->out[no]) + strm->buf.out = kmap(rq->out[no]) + pageofs; + pageofs = 0; + } else if (strm->buf.in_pos == strm->buf.in_size) { + kunmap(rq->in[ni]); + + if (++ni >= nrpages_in || !inlen) { + erofs_err(rq->sb, "compressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); + inlen -= strm->buf.in_size; + kin = kmap(rq->in[ni]); + strm->buf.in = kin; + bounced = false; + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Otherwise, Use short-lived pages + * from the on-stack pagepool where pages share with the same + * request. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); + strm->buf.in = strm->bounce; + bounced = true; + } + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); + DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); + DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + + if (xz_err != XZ_OK) { + if (xz_err == XZ_STREAM_END && !outlen) + break; + erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + xz_err, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + if (no < nrpages_out && strm->buf.out) + kunmap(rq->in[no]); + if (ni < nrpages_in) + kunmap(rq->in[ni]); + /* 4. 
push back LZMA stream context to the global list */ + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + wake_up(&z_erofs_lzma_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b0b23f41abc3..083997a034e5 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -21,14 +21,29 @@ #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) #define EROFS_SB_EXTSLOT_SIZE 16 +struct erofs_deviceslot { + union { + u8 uuid[16]; /* used for device manager later */ + u8 userdata[64]; /* digest(sha256), etc. */ + } u; + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) + /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ @@ -54,7 +69,9 @@ struct erofs_super_block { /* customized sliding window size instead of 64k by default */ __le16 lz4_max_distance; } __packed u1; - __u8 reserved2[42]; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 reserved2[38]; }; /* @@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 8-byte inode chunk indexes */ struct erofs_inode_chunk_index { __le16 advise; /* always 0, don't care for now */ - __le16 device_id; /* back-end storage id, always 0 for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ __le32 blkaddr; /* start block address of this inode chunk */ }; @@ -247,10 +264,11 @@ struct erofs_inode_chunk_index { /* available compression algorithm types (for h_algorithmtype) */ enum { - Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_MAX }; -#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) +#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { @@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs { u8 reserved[10]; } __packed; +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lzma_cfgs { + __le32 dict_size; + __le16 format; + u8 reserved[8]; +} __packed; + +#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. 
for 4k logical cluster size, 4B if compacted 2B is off; @@ -288,35 +315,34 @@ struct z_erofs_map_header { #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 /* - * Fixed-sized output compression ondisk Logical Extent cluster type: - * 0 - literal (uncompressed) cluster - * 1 - compressed cluster (for the head logical cluster) - * 2 - compressed cluster (for the other logical clusters) + * Fixed-sized output compression on-disk logical cluster type: + * 0 - literal (uncompressed) lcluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * * In detail, - * 0 - literal (uncompressed) cluster, + * 0 - literal (uncompressed) lcluster, * di_advise = 0 - * di_clusterofs = the literal data offset of the cluster - * di_blkaddr = the blkaddr of the literal cluster + * di_clusterofs = the literal data offset of the lcluster + * di_blkaddr = the blkaddr of the literal pcluster * - * 1 - compressed cluster (for the head logical cluster) - * di_advise = 1 - * di_clusterofs = the decompressed data offset of the cluster - * di_blkaddr = the blkaddr of the compressed cluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * di_advise = 1 or 3 + * di_clusterofs = the decompressed data offset of the lcluster + * di_blkaddr = the blkaddr of the compressed pcluster * - * 2 - compressed cluster (for the other logical clusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * di_advise = 2 * di_clusterofs = - * the decompressed data offset in its own head cluster - * di_u.delta[0] = distance to its corresponding head cluster - * di_u.delta[1] = distance to its corresponding tail cluster - * (di_advise could be 0, 1 or 2) + * the decompressed data offset in its own HEAD lcluster + * di_u.delta[0] = distance to this HEAD lcluster + * di_u.delta[1] = distance to the next HEAD lcluster */ enum { Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, Z_EROFS_VLE_CLUSTER_TYPE_MAX }; @@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != sizeof(struct z_erofs_vle_decompressed_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index a552399e211d..2345f1de438e 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_flags &= ~S_DAX; - if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9524e155b38f..3265688af7f9 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,7 +47,16 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; -struct erofs_fs_context { +struct erofs_device_info { + char *path; + struct block_device *bdev; + struct dax_device *dax_dev; + + u32 blocks; + u32 mapped_blkaddr; +}; + +struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed 
cache */ unsigned char cache_strategy; @@ -60,6 +69,18 @@ struct erofs_fs_context { unsigned int mount_opt; }; +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; +}; + +struct erofs_fs_context { + struct erofs_mount_opts opt; + struct erofs_dev_context *devs; +}; + /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ @@ -69,6 +90,7 @@ struct erofs_sb_lz4_info { }; struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -85,12 +107,16 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct erofs_dev_context *devs; struct dax_device *dax_dev; - u32 blocks; + u64 total_blocks; + u32 primarydevice_blocks; + u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; #endif + u16 device_id_mask; /* valid bits of device id to be used */ /* inode slot unit size in bit shift */ unsigned char islotbits; @@ -108,8 +134,6 @@ struct erofs_sb_info { u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; - - struct erofs_fs_context ctx; /* options */ }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -121,9 +145,9 @@ struct erofs_sb_info { #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 -#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) -#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) -#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option) +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) enum { EROFS_ZIP_CACHE_DISABLED, @@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) EROFS_I_DATALAYOUT_BITS); } +/* + * Different from grab_cache_page_nowait(), reclaiming is never triggered + * when allocating new pages. + */ +static inline +struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); +} + extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; @@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops; * of the corresponding uncompressed data in the file. 
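The multi-device support introduced above reduces, at read time, to a range lookup over the device table: each extra device backs `blocks` blocks of the unified block address space starting at `mapped_blkaddr`, and erofs_map_dev() picks the matching block device plus a device-local address. A minimal userspace sketch of that lookup (hypothetical names, a plain array instead of the kernel's IDR tree) might look like:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical analogue of the per-device info kept by the mount. */
struct dev_range {
	uint32_t mapped_blkaddr;	/* first unified block this device backs */
	uint32_t blocks;		/* number of blocks it provides */
};

/*
 * Resolve a unified block address to (device index, device-local block).
 * Index 0 stands in for the primary device in this sketch.
 */
static int map_dev(const struct dev_range *devs, int ndevs,
		   uint32_t blkaddr, int *dev_idx, uint32_t *dev_blk)
{
	for (int i = 0; i < ndevs; i++) {
		if (blkaddr >= devs[i].mapped_blkaddr &&
		    blkaddr < devs[i].mapped_blkaddr + devs[i].blocks) {
			*dev_idx = i;
			*dev_blk = blkaddr - devs[i].mapped_blkaddr;
			return 0;
		}
	}
	return -1;	/* out of range: bad address or corrupted image */
}

int main(void)
{
	const struct dev_range devs[] = {
		{ .mapped_blkaddr = 0,    .blocks = 1024 },	/* primary */
		{ .mapped_blkaddr = 1024, .blocks = 512  },	/* extra blob */
	};
	int idx;
	uint32_t blk;

	if (!map_dev(devs, 2, 1100, &idx, &blk))
		printf("block 1100 -> device %d, local block %u\n",
		       idx, (unsigned int)blk);
	return 0;
}

Here block 1100 resolves to the second device at local block 76; the in-kernel version additionally handles explicit device IDs from chunk indexes and the no-extra-devices case.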
*/ enum { - BH_Zipped = BH_PrivateStart, + BH_Encoded = BH_PrivateStart, BH_FullMapped, }; @@ -346,8 +384,8 @@ enum { #define EROFS_MAP_MAPPED (1 << BH_Mapped) /* Located in metadata (could be copied from bd_inode) */ #define EROFS_MAP_META (1 << BH_Meta) -/* The extent has been compressed */ -#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The extent is encoded */ +#define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) @@ -355,6 +393,8 @@ struct erofs_map_blocks { erofs_off_t m_pa, m_la; u64 m_plen, m_llen; + unsigned short m_deviceid; + char m_algorithmformat; unsigned int m_flags; struct page *mpage; @@ -367,6 +407,13 @@ struct erofs_map_blocks { * approach instead if possible since it's more metadata lightweight.) */ #define EROFS_GET_BLOCKS_FIEMAP 0x0002 +/* Used to map the whole extent if non-negligible data is requested for LZMA */ +#define EROFS_GET_BLOCKS_READMORE 0x0004 + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; /* zmap.c */ extern const struct iomap_ops z_erofs_iomap_report_ops; @@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, } #endif /* !CONFIG_EROFS_FS_ZIP */ +struct erofs_map_dev { + struct block_device *m_bdev; + struct dax_device *m_daxdev; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; + /* data.c */ extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +static inline void erofs_pagepool_add(struct page **pagepool, + struct page *page) +{ + set_page_private(page, (unsigned long)*pagepool); + *pagepool = page; +} +void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); @@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_ZIP_LZMA +int z_erofs_lzma_init(void); +void z_erofs_lzma_exit(void); +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size); +#else +static inline int z_erofs_lzma_init(void) { return 0; } +static inline int z_erofs_lzma_exit(void) { return 0; } +static inline int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) { + if (lzma) { + erofs_err(sb, "lzma algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c index 6c885575128a..a2efd833d1b6 100644 --- a/fs/erofs/pcpubuf.c +++ b/fs/erofs/pcpubuf.c @@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) { static DEFINE_MUTEX(pcb_resize_mutex); static unsigned int pcb_nrpages; - LIST_HEAD(pagepool); + struct page *pagepool = NULL; int delta, cpu, ret, i; mutex_lock(&pcb_resize_mutex); @@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) 
vunmap(old_ptr); free_pagearray: while (i) - list_add(&oldpages[--i]->lru, &pagepool); + erofs_pagepool_add(&pagepool, oldpages[--i]); kfree(oldpages); if (ret) break; } pcb_nrpages = nrpages; - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); out: mutex_unlock(&pcb_resize_mutex); return ret; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 11b88559f8bf..6a969b1e0ee6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZ4: ret = z_erofs_load_lz4_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_LZMA: + ret = z_erofs_load_lzma_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } #endif +static int erofs_init_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct page *page = NULL; + struct erofs_device_info *dif; + struct erofs_deviceslot *dis; + void *ptr; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + idr_for_each_entry(&sbi->devs->tree, dif, id) { + erofs_blk_t blk = erofs_blknr(pos); + struct block_device *bdev; + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + up_read(&sbi->devs->rwsem); + return PTR_ERR(page); + } + ptr = kmap(page); + } + dis = ptr + erofs_blkoff(pos); + + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto err_out; + } + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + pos += EROFS_DEVT_SLOT_SIZE; + } +err_out: + up_read(&sbi->devs->rwsem); + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + return err; +} + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_load_compr_cfgs(sb, dsb); else ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + if (ret < 0) + goto out; + + /* handle multiple devices */ + ret = erofs_init_devices(sb, dsb); out: kunmap(page); put_page(page); @@ -340,15 +421,15 @@ out: static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; - 
ctx->max_sync_decompress_pages = 3; - ctx->readahead_sync_decompress = false; + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; + ctx->opt.readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); #endif } @@ -358,6 +439,7 @@ enum { Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_err }; @@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), + fsparam_string("device", Opt_device), {} }; @@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(ctx, DAX_ALWAYS); - clear_opt(ctx, DAX_NEVER); + set_opt(&ctx->opt, DAX_ALWAYS); + clear_opt(&ctx->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(ctx, DAX_NEVER); - clear_opt(ctx, DAX_ALWAYS); + set_opt(&ctx->opt, DAX_NEVER); + clear_opt(&ctx->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx __maybe_unused = fc->fs_private; + struct erofs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + struct erofs_device_info *dif; + int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) @@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); else - clear_opt(ctx, XATTR_USER); + clear_opt(&ctx->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); else - clear_opt(ctx, POSIX_ACL); + clear_opt(&ctx->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = result.uint_32; + ctx->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc, if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if (ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; default: return -ENOPARAM; } @@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->opt = ctx->opt; sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->devs = ctx->devs; + ctx->devs = NULL; + err = 
erofs_read_superblock(sb); if (err) return err; - if (test_opt(ctx, DAX_ALWAYS) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(ctx, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_ALWAYS); } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sb->s_xattr = erofs_xattr_handlers; - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; - #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif @@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; + sbi->opt = ctx->opt; fc->sb_flags |= SB_RDONLY; return 0; } +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + fs_put_dax(dif->dax_dev); + if (dif->bdev) + blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + static void erofs_fc_free(struct fs_context *fc) { - kfree(fc->fs_private); + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx); } static const struct fs_context_operations erofs_context_ops = { @@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL); - if (!fc->fs_private) - return -ENOMEM; + struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - /* set default mount options */ - erofs_default_options(fc->fs_private); + if (!ctx) + return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + fc->fs_private = ctx; + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); fc->ops = &erofs_context_ops; - return 0; } @@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + + erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; @@ -706,6 +843,10 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + err = z_erofs_lzma_init(); + if (err) + goto lzma_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -720,6 +861,8 @@ static int __init erofs_module_init(void) fs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_lzma_exit(); +lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); @@ -730,11 +873,13 @@ icache_err: static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); - z_erofs_exit_zip_subsystem(); - erofs_exit_shrinker(); - /* Ensure all RCU free inodes are safe before cache is destroyed. */ + /* Ensure all RCU free inodes / pclusters are safe to be destroyed. 
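The pagepool rework visible above (erofs_allocpage(), erofs_pagepool_add(), erofs_release_pages()) replaces the old list_head pool with a singly linked stack threaded through page_private, so recycling a free page costs one pointer store. A rough userspace analogue of the pattern, with a plain struct standing in for struct page, is sketched below:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct page: only the private field matters here. */
struct fake_page {
	unsigned long private;	/* reused as the "next" pointer */
	int id;
};

/* Push a free page onto the pool (cf. erofs_pagepool_add()). */
static void pool_add(struct fake_page **pool, struct fake_page *pg)
{
	pg->private = (unsigned long)*pool;
	*pool = pg;
}

/* Pop from the pool, or fall back to a fresh allocation (cf. erofs_allocpage()). */
static struct fake_page *pool_alloc(struct fake_page **pool)
{
	struct fake_page *pg = *pool;

	if (pg)
		*pool = (struct fake_page *)pg->private;
	else
		pg = calloc(1, sizeof(*pg));
	return pg;
}

/* Free everything left in the pool (cf. erofs_release_pages()). */
static void pool_release(struct fake_page **pool)
{
	while (*pool) {
		struct fake_page *pg = *pool;

		*pool = (struct fake_page *)pg->private;
		free(pg);
	}
}

int main(void)
{
	struct fake_page *pool = NULL;
	struct fake_page *a = pool_alloc(&pool);	/* fresh allocation */

	if (!a)
		return 1;
	a->id = 1;
	pool_add(&pool, a);				/* recycle it */

	struct fake_page *b = pool_alloc(&pool);	/* comes back from the pool */
	printf("reused page id=%d\n", b->id);
	free(b);
	pool_release(&pool);
	return 0;
}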
*/ rcu_barrier(); + + z_erofs_exit_zip_subsystem(); + z_erofs_lzma_exit(); + erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } @@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; - buf->f_blocks = sbi->blocks; + buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; @@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx = &sbi->ctx; + struct erofs_mount_opts *opt = &sbi->opt; #ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(ctx, XATTR_USER)) + if (test_opt(opt, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(opt, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif #ifdef CONFIG_EROFS_FS_ZIP - if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) seq_puts(seq, ",cache_strategy=disabled"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) seq_puts(seq, ",cache_strategy=readahead"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif - if (test_opt(ctx, DAX_ALWAYS)) + if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); - if (test_opt(ctx, DAX_NEVER)) + if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index bd86067a63f7..84da2c280012 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -6,20 +6,29 @@ #include "internal.h" #include <linux/pagevec.h> -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { - struct page *page; + struct page *page = *pagepool; - if (!list_empty(pool)) { - page = lru_to_page(pool); + if (page) { DBG_BUGON(page_ref_count(page) != 1); - list_del(&page->lru); + *pagepool = (struct page *)page_private(page); } else { page = alloc_page(gfp); } return page; } +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + put_page(page); + } +} + #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 778f2c52295d..01c581e93c5f 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) static bool erofs_xattr_user_list(struct dentry *dentry) { - return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER); + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) @@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, switch (handler->flags) { case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->ctx, XATTR_USER)) + if (!test_opt(&sbi->opt, XATTR_USER)) return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 
11c7a1aaebad..bcb1b91b234f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + struct page **pagepool) { struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; @@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) { + if (page) put_page(page); - } else if (newpage) { - set_page_private(newpage, 0); - list_add(&newpage->lru, pagepool); - } + else if (newpage) + erofs_pagepool_add(pagepool, newpage); } /* @@ -476,6 +474,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; + if (!(map->m_flags & EROFS_MAP_ENCODED)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + /* no available pcluster, let's allocate one */ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) @@ -483,16 +486,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; - + pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? Z_EROFS_PCLUSTER_FULL_LENGTH : 0); - if (map->m_flags & EROFS_MAP_ZIPPED) - pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; - else - pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; @@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct list_head *pagepool) + struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -695,7 +693,7 @@ restart_now: goto err_out; /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; @@ -796,7 +794,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); - sbi->ctx.readahead_sync_decompress = true; + sbi->opt.readahead_sync_decompress = true; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct list_head *pagepool) + struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); struct z_erofs_pagevec_ctor ctor; @@ -1036,7 +1034,7 @@ out: } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct list_head *pagepool) + struct page **pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); - LIST_HEAD(pagepool); + 
struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, - struct list_head *pagepool, + struct page **pagepool, struct address_space *mc, gfp_t gfp) { @@ -1173,7 +1171,7 @@ repeat: out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } @@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, + struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { @@ -1266,8 +1264,9 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; z_erofs_next_pcluster_t owned_head = f->clt.owned_head; - /* since bio will be NULL, no need to initialize last_index */ + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; @@ -1279,6 +1278,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1290,7 +1290,13 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; /* close the main owned chain at first */ @@ -1306,7 +1312,8 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -1314,9 +1321,10 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + bio_set_dev(bio, mdev.m_bdev); + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; @@ -1355,7 +1363,7 @@ submit_bio_retry: static void z_erofs_runqueue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, bool force_fg) + struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; @@ -1377,18 +1385,87 @@ static void z_erofs_runqueue(struct super_block *sb, z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } +/* + * Since partial uptodate is still unimplemented for now, we have to use + * approximate readmore strategies as a start. 
+ */ +static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, + struct readahead_control *rac, + erofs_off_t end, + struct page **pagepool, + bool backmost) +{ + struct inode *inode = f->inode; + struct erofs_map_blocks *map = &f->map; + erofs_off_t cur; + int err; + + if (backmost) { + map->m_la = end; + err = z_erofs_map_blocks_iter(inode, map, + EROFS_GET_BLOCKS_READMORE); + if (err) + return; + + /* expend ra for the trailing edge if readahead */ + if (rac) { + loff_t newstart = readahead_pos(rac); + + cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); + readahead_expand(rac, newstart, cur - newstart); + return; + } + end = round_up(end, PAGE_SIZE); + } else { + end = round_up(map->m_la, PAGE_SIZE); + + if (!map->m_llen) + return; + } + + cur = map->m_la + map->m_llen - 1; + while (cur >= end) { + pgoff_t index = cur >> PAGE_SHIFT; + struct page *page; + + page = erofs_grab_cache_page_nowait(inode->i_mapping, index); + if (!page) + goto skip; + + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + goto skip; + } + + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + put_page(page); +skip: + if (cur < PAGE_SIZE) + break; + cur = (index << PAGE_SHIFT) - 1; + } +} + static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + struct page *pagepool = NULL; int err; - LIST_HEAD(pagepool); trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, + &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); + z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); + (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1400,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) if (f.map.mpage) put_page(f.map.mpage); - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); return err; } @@ -1409,29 +1485,19 @@ static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - unsigned int nr_pages = readahead_count(rac); - bool sync = (sbi->ctx.readahead_sync_decompress && - nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *page, *head = NULL; - LIST_HEAD(pagepool); - - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + struct page *pagepool = NULL, *head = NULL, *page; + unsigned int nr_pages; f.readahead = true; f.headoffset = readahead_pos(rac); - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - /* - * A pure asynchronous readahead is indicated if - * a PG_readahead marked page is hitted at first. - * Let's also do asynchronous decompression for this case. 
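z_erofs_pcluster_readmore() above approximates reading whole compressed extents: it rounds the requested range to page boundaries and walks the covering page indices backwards, skipping pages that are already up to date. The index arithmetic can be sketched in plain C as follows (illustrative only; the page-cache lookups and the actual reads are elided):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	((uint64_t)1 << PAGE_SHIFT)

static uint64_t round_up_page(uint64_t v)
{
	return (v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

/*
 * Walk the page indices covering [end, m_la + m_llen) from the tail
 * towards 'end', the way the readmore helper does; real code would
 * try to grab each page from the page cache and queue a read for it.
 */
static void readmore_indices(uint64_t m_la, uint64_t m_llen, uint64_t end)
{
	uint64_t cur = m_la + m_llen - 1;

	end = round_up_page(end);
	while (cur >= end) {
		uint64_t index = cur >> PAGE_SHIFT;

		printf("would pre-read page index %llu\n",
		       (unsigned long long)index);
		if (cur < PAGE_SIZE)
			break;
		cur = (index << PAGE_SHIFT) - 1;
	}
}

int main(void)
{
	/* extent decompresses to [8192, 8192 + 20000); reader stopped at 12288 */
	readmore_indices(8192, 20000, 12288);
	return 0;
}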
- */ - sync &= !(PageReadahead(page) && !head); + z_erofs_pcluster_readmore(&f, rac, f.headoffset + + readahead_length(rac) - 1, &pagepool, true); + nr_pages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + while ((page = readahead_page(rac))) { set_page_private(page, (unsigned long)head); head = page; } @@ -1450,16 +1516,15 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - + z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); - + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + sbi->opt.readahead_sync_decompress && + nr_pages <= sbi->opt.max_sync_decompress_pages); if (f.map.mpage) put_page(f.map.mpage); - - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 3a008f1b9f78..879df5362777 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -94,13 +94,6 @@ struct z_erofs_decompressqueue { } u; }; -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} - #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7a6df35fdc91..660489a7fb64 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err; + int err, headnr; erofs_off_t pos; struct page *page; void *kaddr; @@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", - vi->z_algorithmtype[0], vi->nid); + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto unmap_done; } @@ -111,7 +113,7 @@ struct z_erofs_maprecorder { unsigned long lcn; /* compression extent information gathered */ - u8 type; + u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; @@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { - if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: 
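With the HEAD2 lcluster type, the per-inode map header packs two algorithm indices into one byte (low nibble for HEAD1 extents, high nibble for HEAD2), and the mapping code then selects one of them per extent (see the m_algorithmformat assignment that follows). A hedged sketch of that decode-and-select step, using -1 as a stand-in for the uncompressed "shifted" runtime format:

#include <stdio.h>

/* values mirror the on-disk definitions in the diff above */
enum { COMPRESSION_LZ4 = 0, COMPRESSION_LZMA = 1 };
enum { TYPE_PLAIN = 0, TYPE_HEAD1 = 1, TYPE_NONHEAD = 2, TYPE_HEAD2 = 3 };

/* -1 stands in for the uncompressed ("shifted") format */
static int pick_format(int headtype, const int algs[2])
{
	if (headtype == TYPE_PLAIN)
		return -1;
	if (headtype == TYPE_HEAD2)
		return algs[1];
	return algs[0];		/* TYPE_HEAD1 */
}

int main(void)
{
	/* low nibble: HEAD1 algorithm, high nibble: HEAD2 algorithm */
	unsigned char h_algorithmtype = (COMPRESSION_LZMA << 4) | COMPRESSION_LZ4;
	int algs[2] = { h_algorithmtype & 15, h_algorithmtype >> 4 };

	printf("HEAD1 extents use algorithm %d\n", pick_format(TYPE_HEAD1, algs));
	printf("HEAD2 extents use algorithm %d\n", pick_format(TYPE_HEAD2, algs));
	return 0;
}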
m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -446,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + m->headtype = m->type; map->m_la = (lcn << lclusterbits) | m->clusterofs; break; default: @@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, int err; DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); - if (!(map->m_flags & EROFS_MAP_ZIPPED) || - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != m->headtype); + + if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1 << lclusterbits; return 0; } - lcn = m->lcn + 1; if (m->compressedlcs) goto out; @@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, switch (m->type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. @@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; - map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - if (endoff >= m.clusterofs) - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { + m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; break; } @@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_llen = end - map->m_la; map->m_pa = blknr_to_addr(m.pblk); - map->m_flags |= EROFS_MAP_MAPPED; err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; - if (flags & EROFS_GET_BLOCKS_FIEMAP) { + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + map->m_algorithmformat = vi->z_algorithmtype[1]; + else + map->m_algorithmformat = vi->z_algorithmtype[0]; + + if ((flags & EROFS_GET_BLOCKS_FIEMAP) || + ((flags & EROFS_GET_BLOCKS_READMORE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && + map->m_llen >= EROFS_BLKSIZ)) { err = z_erofs_get_extent_decompressedlen(&m); if 
(!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index ca37d4344361..1c7aa1ea4724 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -604,7 +604,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac0e11bbb445..9c5559faacda 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -915,7 +915,7 @@ const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88d5d274a868..79b6a0c47f6f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = { .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, - .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; @@ -4474,7 +4473,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto cantfind_ext4; /* check blocks count against device size */ - blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c1bf9ad4c220..20a083dc9042 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/moduleparam.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/lzo.h> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 78ebc306ee2b..cf049a042482 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, .get_num_devices = f2fs_get_num_devices, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0c9b013a85..a6f1c6d426d1 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb, struct fat_bios_param_block *bpb) { static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; - + sector_t bd_sects = bdev_nr_sectors(sb->s_bdev); struct fat_floppy_defaults *fdefaults = NULL; int error = -EINVAL; - sector_t bd_sects; unsigned i; - bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; - /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { if (!silent) @@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) ret = writeback_inode(i1); if (!ret && i2) ret = writeback_inode(i2); - if (!ret) 
{ - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = filemap_flush(mapping); - } + if (!ret) + ret = sync_blockdev_nowait(sb->s_bdev); return ret; } EXPORT_SYMBOL_GPL(fat_flush_inodes); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 81ec192ce067..4124a89a1a5d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - blk_flush_plug(current); + if (current->plug) + blk_flush_plug(current->plug, false); cond_resched(); } @@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason) * If we are expecting writeback progress we must submit plugged IO. */ if (blk_needs_flush_plug(current)) - blk_schedule_flush_plug(current); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index f346a78f4bd6..6a675652129b 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -77,7 +77,6 @@ static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); -static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); @@ -907,6 +906,7 @@ static void fscache_dequeue_object(struct fscache_object *object) * @object: The object to ask about * @data: The auxiliary data for the object * @datalen: The size of the auxiliary data + * @object_size: The size of the object according to the server. * * This function consults the netfs about the coherency state of an object. * The caller must be holding a ref on cookie->n_active (held by diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 433877107700..e002cdfaf3cc 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -22,7 +22,10 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op) /** * fscache_operation_init - Do basic initialisation of an operation + * @cookie: The cookie to operate on * @op: The operation to initialise + * @processor: The function to perform the operation + * @cancel: A function to handle operation cancellation * @release: The release function to assign * * Do basic initialisation of an operation. 
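The exfat i_blocks change earlier in this series fixes a classic width bug: ~(sbi->cluster_size - 1) is evaluated in 32 bits, so masking a 64-bit size with it silently clears the upper bits, while casting to loff_t first keeps the mask wide. A small standalone demonstration (4 KiB clusters and a file just over 4 GiB are assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t cluster_size = 4096;
	int64_t i_size = (1LL << 32) + 5;		/* 4 GiB + 5 bytes */

	/* buggy: the mask is a 32-bit value, so bits >= 32 are dropped */
	int64_t bad  = (i_size + (cluster_size - 1)) & ~(cluster_size - 1);
	/* fixed: widen before inverting, as the patch does with (loff_t) */
	int64_t good = (i_size + (cluster_size - 1)) & ~((int64_t)cluster_size - 1);

	printf("bad  rounding: %lld\n", (long long)bad);	/* 4096 */
	printf("good rounding: %lld\n", (long long)good);	/* 4294971392 */
	return 0;
}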
The caller must still set flags, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 11404f8c21c7..e6039f22311b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -687,7 +687,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fi->lock); } - io->iocb->ki_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res); } kref_put(&io->refcnt, fuse_io_release); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 319596df5dc6..f55f9f94b1a4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1121,6 +1121,9 @@ int fuse_init_fs_context_submount(struct fs_context *fsc); */ void fuse_conn_destroy(struct fuse_mount *fm); +/* Drop the connection and free the fuse mount */ +void fuse_mount_destroy(struct fuse_mount *fm); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 36cd03114b6d..12d49a1914e8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -457,14 +457,6 @@ static void fuse_send_destroy(struct fuse_mount *fm) } } -static void fuse_put_super(struct super_block *sb) -{ - struct fuse_mount *fm = get_fuse_mount_super(sb); - - fuse_conn_put(fm->fc); - kfree(fm); -} - static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) { stbuf->f_type = FUSE_SUPER_MAGIC; @@ -1003,7 +995,6 @@ static const struct super_operations fuse_super_operations = { .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, - .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, @@ -1424,20 +1415,17 @@ static int fuse_get_tree_submount(struct fs_context *fsc) if (!fm) return -ENOMEM; + fm->fc = fuse_conn_get(fc); fsc->s_fs_info = fm; sb = sget_fc(fsc, NULL, set_anon_super_fc); - if (IS_ERR(sb)) { - kfree(fm); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) return PTR_ERR(sb); - } - fm->fc = fuse_conn_get(fc); /* Initialize superblock, making @mp_fi its root */ err = fuse_fill_super_submount(sb, mp_fi); if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } @@ -1569,8 +1557,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; int err; - struct fuse_conn *fc; - struct fuse_mount *fm; if (!ctx->file || !ctx->rootmode_present || !ctx->user_id_present || !ctx->group_id_present) @@ -1580,42 +1566,18 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. 
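The fuse refactor above funnels teardown through the new fuse_mount_destroy() helper (a connection reference drop plus freeing the mount) instead of repeating that pair on every error path. A reduced userspace sketch of the ownership pattern, with purely illustrative names and none of fuse's actual locking or superblock handling:

#include <stdio.h>
#include <stdlib.h>

struct conn { int refcount; };
struct mount { struct conn *fc; };

static struct conn *conn_get(struct conn *fc)
{
	fc->refcount++;
	return fc;
}

static void conn_put(struct conn *fc)
{
	if (--fc->refcount == 0) {
		printf("connection freed\n");
		free(fc);
	}
}

/* Single place that undoes a mount, cf. fuse_mount_destroy(). */
static void mount_destroy(struct mount *fm)
{
	conn_put(fm->fc);
	free(fm);
}

int main(void)
{
	struct conn *fc = calloc(1, sizeof(*fc));
	struct mount *fm = malloc(sizeof(*fm));

	fc->refcount = 1;		/* creator's reference */
	fm->fc = conn_get(fc);		/* mount holds its own reference */

	/* pretend setup failed: one call cleans the mount up */
	mount_destroy(fm);
	conn_put(fc);			/* drop the creator's reference */
	return 0;
}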
*/ - err = -EINVAL; if ((ctx->file->f_op != &fuse_dev_operations) || (ctx->file->f_cred->user_ns != sb->s_user_ns)) - goto err; + return -EINVAL; ctx->fudptr = &ctx->file->private_data; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err; - - fm = kzalloc(sizeof(*fm), GFP_KERNEL); - if (!fm) { - kfree(fc); - goto err; - } - - fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); - fc->release = fuse_free_conn; - - sb->s_fs_info = fm; - err = fuse_fill_super_common(sb, ctx); if (err) - goto err_put_conn; + return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; - - err_put_conn: - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; - err: - return err; } /* @@ -1637,22 +1599,40 @@ static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_mount *fm; struct super_block *sb; int err; + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) + return -ENOMEM; + + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + + fsc->s_fs_info = fm; + if (ctx->fd_present) ctx->file = fget(ctx->fd); if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { err = get_tree_bdev(fsc, fuse_fill_super); - goto out_fput; + goto out; } /* * While block dev mount can be initialized with a dummy device fd * (found by device name), normal fuse mounts can't */ + err = -EINVAL; if (!ctx->file) - return -EINVAL; + goto out; /* * Allow creating a fuse mount with an already initialized fuse @@ -1668,7 +1648,9 @@ static int fuse_get_tree(struct fs_context *fsc) } else { err = get_tree_nodev(fsc, fuse_fill_super); } -out_fput: +out: + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (ctx->file) fput(ctx->file); return err; @@ -1747,17 +1729,25 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_mount *fm = get_fuse_mount_super(sb); bool last; - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) fuse_conn_destroy(fm); } } +void fuse_mount_destroy(struct fuse_mount *fm) +{ + fuse_conn_put(fm->fc); + kfree(fm); +} +EXPORT_SYMBOL(fuse_mount_destroy); + static void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuse_fs_type = { @@ -1775,6 +1765,7 @@ static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); kill_block_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuseblk_fs_type = { diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 0ad89c6629d7..94fc874f5de7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1394,12 +1394,13 @@ static void virtio_kill_sb(struct super_block *sb) bool last; /* If mount failed, we can still be called without any fc */ - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) virtio_fs_conn_destroy(fm); } kill_anon_super(sb); + fuse_mount_destroy(fm); } static int virtio_fs_test_super(struct super_block *sb, @@ -1455,19 +1456,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); - if (fsc->s_fs_info) { - fuse_conn_put(fc); - kfree(fm); - } + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if 
(!sb->s_root) { err = virtio_fs_fill_super(sb, fsc); if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c559827cb6f9..5436a688157a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1338,8 +1338,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (fl->fl_type & LOCK_MAND) - return -EOPNOTSUPP; if (fl->fl_type == F_UNLCK) { do_unflock(file, fl); @@ -1353,7 +1351,7 @@ const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, @@ -1386,7 +1384,7 @@ const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index cdf0edeeb278..5beb82652435 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb, /* default values */ *start = 0; - *size = i_size_read(sb->s_bdev->bd_inode) >> 9; + *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0350dc7821bf..51ae6f1eb4a5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb, /* default values */ *start = 0; - *size = i_size_read(sb->s_bdev->bd_inode) >> 9; + *size = bdev_nr_sectors(sb->s_bdev); if (HFSPLUS_SB(sb)->session >= 0) { struct cdrom_tocentry te; diff --git a/fs/inode.c b/fs/inode.c index ed0cab8a32db..9abc88d7959c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1782,12 +1782,13 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -static int update_time(struct inode *inode, struct timespec64 *time, int flags) +int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) { if (inode->i_op->update_time) return inode->i_op->update_time(inode, time, flags); return generic_update_time(inode, time, flags); } +EXPORT_SYMBOL(inode_update_time); /** * atime_needs_update - update the access time @@ -1857,7 +1858,7 @@ void touch_atime(const struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. 
*/ now = current_time(inode); - update_time(inode, &now, S_ATIME); + inode_update_time(inode, &now, S_ATIME); __mnt_drop_write(mnt); skip_update: sb_end_write(inode->i_sb); @@ -2002,7 +2003,7 @@ int file_update_time(struct file *file) if (__mnt_want_write_file(file)) return 0; - ret = update_time(inode, &now, sync_it); + ret = inode_update_time(inode, &now, sync_it); __mnt_drop_write_file(file); return ret; diff --git a/fs/internal.h b/fs/internal.h index 3cd065c8a66b..cdd83d4899bb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,22 +23,11 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -extern int __sync_blockdev(struct block_device *bdev, int wait); -void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } - -static inline int __sync_blockdev(struct block_device *bdev, int wait) -{ - return 0; -} -static inline void iterate_bdevs(void (*f)(struct block_device *, void *), - void *arg) -{ -} static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; diff --git a/fs/io-wq.c b/fs/io-wq.c index 5bf8aa81715e..38b33ad9e8cf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match); +static void create_worker_cb(struct callback_head *cb); static bool io_worker_get(struct io_worker *worker) { @@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq) complete(&wq->worker_done); } +static void io_worker_cancel_cb(struct io_worker *worker) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); + io_worker_ref_put(wq); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); +} + +static bool io_task_worker_match(struct callback_head *cb, void *data) +{ + struct io_worker *worker; + + if (cb->func != create_worker_cb) + return false; + worker = container_of(cb, struct io_worker, create_work); + return worker == data; +} + static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; - if (refcount_dec_and_test(&worker->ref)) - complete(&worker->ref_done); + while (1) { + struct callback_head *cb = task_work_cancel_match(wq->task, + io_task_worker_match, worker); + + if (!cb) + break; + io_worker_cancel_cb(worker); + } + + io_worker_release(worker); wait_for_completion(&worker->ref_done); raw_spin_lock(&wqe->lock); @@ -253,7 +288,7 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) pr_warn_once("io-wq is not configured for unbound workers"); raw_spin_lock(&wqe->lock); - if (acct->nr_workers == acct->max_workers) { + if (acct->nr_workers >= acct->max_workers) { raw_spin_unlock(&wqe->lock); return true; } @@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker, init_task_work(&worker->create_work, func); worker->create_index = acct->index; - if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { + clear_bit_unlock(0, &worker->create_state); return true; + } clear_bit_unlock(0, &worker->create_state); fail_release: io_worker_release(worker); @@ 
-716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work) struct io_worker *worker = container_of(work, struct io_worker, work); struct io_wqe_acct *acct = io_wqe_get_acct(worker); - if (!io_queue_worker_create(worker, acct, create_worker_cont)) { - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); - } } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) @@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; - struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - acct = io_wqe_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + io_worker_cancel_cb(worker); } rcu_read_lock(); @@ -1291,15 +1317,18 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; struct io_wqe_acct *acct; + raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wq->wqes[node]->acct[i]; + acct = &wqe->acct[i]; prev = max_t(int, acct->max_workers, prev); if (new_count[i]) acct->max_workers = new_count[i]; new_count[i] = prev; } + raw_spin_unlock(&wqe->lock); } rcu_read_unlock(); return 0; diff --git a/fs/io-wq.h b/fs/io-wq.h index bf5c4c533760..41bf37674a49 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -29,6 +29,17 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) @@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) @@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list, last->next = NULL; } +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) @@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list, wq_list_cut(list, node, prev); } -#define 
wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ - (list)->last = NULL; \ -} while (0) + stack->next = node->next; + return node; +} struct io_wq_work { struct io_wq_work_node list; diff --git a/fs/io_uring.c b/fs/io_uring.c index 6b9e70208782..3a4af9799d9a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -103,11 +103,14 @@ #define IORING_MAX_REG_BUFFERS (1U << 14) -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) +#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC) + +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -195,8 +198,10 @@ struct io_rings { }; enum io_uring_cmd_flags { - IO_URING_F_NONBLOCK = 1, - IO_URING_F_COMPLETE_DEFER = 2, + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, }; struct io_mapped_ubuf { @@ -305,26 +310,16 @@ struct io_submit_link { }; struct io_submit_state { - struct blk_plug plug; + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; struct io_submit_link link; - /* - * io_kiocb alloc cache - */ - void *reqs[IO_REQ_CACHE_SIZE]; - unsigned int free_reqs; - bool plug_started; - - /* - * Batch completion logic - */ - struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; - unsigned int compl_nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; - - unsigned int ios_left; + bool need_plug; + unsigned short submit_nr; + struct blk_plug plug; }; struct io_ring_ctx { @@ -368,6 +363,7 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; @@ -384,7 +380,7 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ - struct list_head locked_free_list; + struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; const struct cred *sq_creds; /* cred used for __io_sq_thread() */ @@ -399,7 +395,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct eventfd_ctx *cq_ev_fd; - struct wait_queue_head poll_wait; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -417,7 +412,7 @@ struct io_ring_ctx { * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. 
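
The io-wq.h hunks above replace list_head plumbing with an intrusive singly linked work list (first/last pointers) plus a LIFO stack reused for the request free list (wq_list_add_tail/wq_list_add_head, wq_stack_add_head/wq_stack_extract). A minimal userspace sketch of the same idea follows; the struct and function names are illustrative stand-ins, not the kernel helpers, and error handling is omitted.

/* Intrusive singly linked list with a cached tail, plus a LIFO "stack"
 * reusing the same node type, in the spirit of wq_list/wq_stack above. */
#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };

struct list {
	struct node *first;
	struct node *last;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list *l, struct node *n)
{
	n->next = NULL;
	if (!l->first)
		l->first = n;
	else
		l->last->next = n;
	l->last = n;
}

static void stack_push(struct node *stack, struct node *n)
{
	n->next = stack->next;
	stack->next = n;
}

static struct node *stack_pop(struct node *stack)
{
	struct node *n = stack->next;

	if (n)
		stack->next = n->next;
	return n;
}

struct request {
	int id;
	struct node link;	/* embedded, so no per-node allocation */
};

int main(void)
{
	struct request reqs[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };
	struct list pending = { NULL, NULL };
	struct node free_list = { NULL };
	struct node *n;

	for (int i = 0; i < 3; i++)
		list_add_tail(&pending, &reqs[i].link);

	/* drain the pending list, recycling nodes onto the free stack */
	for (n = pending.first; n; ) {
		struct node *next = n->next;
		struct request *r = container_of(n, struct request, link);

		printf("completing request %d\n", r->id);
		stack_push(&free_list, n);
		n = next;
	}
	pending.first = pending.last = NULL;

	/* "allocation" now pops from the free stack instead of malloc() */
	n = stack_pop(&free_list);
	printf("reused request %d\n", container_of(n, struct request, link)->id);
	return 0;
}

The stack variant is what makes request recycling cheap: freeing and reallocating a request is two pointer writes, with no tail pointer to maintain.
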
*/ - struct list_head iopoll_list; + struct io_wq_work_list iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; @@ -456,6 +451,8 @@ struct io_ring_ctx { struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; + u32 iowq_limits[2]; + bool iowq_limits_set; }; }; @@ -578,7 +575,6 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; - struct io_buffer *kbuf; }; struct io_open { @@ -690,11 +686,6 @@ struct io_hardlink { int flags; }; -struct io_completion { - struct file *file; - u32 cflags; -}; - struct io_async_connect { struct sockaddr_storage address; }; @@ -708,11 +699,15 @@ struct io_async_msghdr { struct sockaddr_storage addr; }; -struct io_async_rw { - struct iovec fast_iov[UIO_FASTIOV]; - const struct iovec *free_iovec; +struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; + const struct iovec *free_iovec; size_t bytes_done; struct wait_page_queue wpq; }; @@ -739,9 +734,9 @@ enum { REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_NOWAIT_READ_BIT, - REQ_F_NOWAIT_WRITE_BIT, + REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -782,10 +777,8 @@ enum { REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads */ - REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), - /* supports async writes */ - REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ @@ -794,6 +787,8 @@ enum { REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), }; struct async_poll { @@ -850,39 +845,41 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; - /* use only after cleaning per-op data, see io_clean_op() */ - struct io_completion compl; }; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; - u16 buf_index; + unsigned int flags; + + u64 user_data; u32 result; + u32 cflags; struct io_ring_ctx *ctx; - unsigned int flags; - atomic_t refs; struct task_struct *task; - u64 user_data; - struct io_kiocb *link; struct percpu_ref *fixed_rsrc_refs; + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; - /* used with ctx->iopoll_list with reads/writes */ - struct list_head inflight_entry; + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + atomic_t refs; + struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; + /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; struct io_wq_work work; + /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; - - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; }; struct io_tctx_node { @@ -900,12 +897,12 @@ struct io_defer_entry { struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; @@ -913,8 +910,8 @@ struct io_op_def { unsigned buffer_select : 1; /* do prep async if is going to be punted */ unsigned needs_async_setup : 1; - /* should block plug */ - unsigned plug : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; /* size of async data needed, if any */ unsigned short async_size; }; @@ -1078,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags); + s32 res, u32 cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1093,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); -static void io_submit_flush_completions(struct io_ring_ctx *ctx); +static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, @@ -1165,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { @@ -1178,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req) __io_req_set_refcount(req, 1); } -static inline void io_req_set_rsrc_node(struct io_kiocb *req) +#define IO_RSRC_REF_BATCH 100 + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; + struct percpu_ref *ref = req->fixed_rsrc_refs; + + if (ref) { + if (ref == &ctx->rsrc_node->refs) + ctx->rsrc_cached_refs++; + else + percpu_ref_put(ref); + } +} + +static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + if (req->fixed_rsrc_refs) + percpu_ref_put(req->fixed_rsrc_refs); +} + +static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + percpu_ref_put_many(&ctx->rsrc_node->refs, 
ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} +static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx) +{ if (!req->fixed_rsrc_refs) { req->fixed_rsrc_refs = &ctx->rsrc_node->refs; - percpu_ref_get(req->fixed_rsrc_refs); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); } } @@ -1217,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, return false; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -1228,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) req->result = res; } -static void io_ring_ctx_ref_free(struct percpu_ref *ref) +static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1240,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } -static void io_fallback_req_func(struct work_struct *work) +static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); @@ -1253,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work) req->io_task_work.func(req, &locked); if (locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); - } -static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; @@ -1298,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); - init_waitqueue_head(&ctx->poll_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); @@ -1307,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); - INIT_LIST_HEAD(&ctx->iopoll_list); + INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1316,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.free_list); - INIT_LIST_HEAD(&ctx->locked_free_list); + ctx->submit_state.free_list.next = NULL; + INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); + INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1346,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif 
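
The rsrc_cached_refs hunks above amortise percpu_ref get/put traffic: the ring owns a batch of references and hands them to requests from a plain integer under ->uring_lock, refilling by IO_RSRC_REF_BATCH when the cache runs dry and returning the remainder in one percpu_ref_put_many(). A rough userspace sketch of that batching pattern, with a plain atomic standing in for the percpu ref and invented names, might look like this:

/* Batched reference caching: per-request gets and puts touch only a local
 * counter; the shared counter is hit once per batch. Illustrative only. */
#include <stdatomic.h>
#include <stdio.h>

#define REF_BATCH 100

struct resource {
	atomic_long shared_refs;	/* stands in for a percpu_ref */
};

struct ctx_cache {
	struct resource *res;
	long cached_refs;		/* refs owned locally by this context */
};

static void cache_refill(struct ctx_cache *c)
{
	atomic_fetch_add(&c->res->shared_refs, REF_BATCH);
	c->cached_refs += REF_BATCH;
}

static void cache_get(struct ctx_cache *c)	/* per-request "get" */
{
	if (c->cached_refs <= 0)
		cache_refill(c);
	c->cached_refs--;
}

static void cache_put(struct ctx_cache *c)	/* per-request "put" */
{
	c->cached_refs++;
}

static void cache_drain(struct ctx_cache *c)	/* give leftovers back in bulk */
{
	if (c->cached_refs) {
		atomic_fetch_sub(&c->res->shared_refs, c->cached_refs);
		c->cached_refs = 0;
	}
}

int main(void)
{
	struct resource res = { .shared_refs = 1 };	/* creator's reference */
	struct ctx_cache cache = { .res = &res, .cached_refs = 0 };

	for (int i = 0; i < 1000; i++) {
		cache_get(&cache);	/* request takes a ref */
		cache_put(&cache);	/* request completes, ref goes back */
	}
	cache_drain(&cache);
	printf("shared counter back to %ld after a single batched refill\n",
	       atomic_load(&res.shared_refs));
	return 0;
}

In the common case a request's get and put never touch the shared counter at all; only a refill or the final drain does.
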
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline bool io_req_ffs_set(struct io_kiocb *req) { - return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); + return req->flags & REQ_F_FIXED_FILE; } -static void io_req_track_inflight(struct io_kiocb *req) +static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -1368,11 +1408,6 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static inline void io_unprep_linked_timeout(struct io_kiocb *req) -{ - req->flags &= ~REQ_F_LINK_TIMEOUT; -} - static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -1443,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req, bool *locked) +static inline void io_req_add_compl_list(struct io_kiocb *req) +{ + struct io_submit_state *state = &req->ctx->submit_state; + + wq_list_add_tail(&req->comp_list, &state->compl_reqs); +} + +static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; - /* must not take the lock, NULL it as a precaution */ - locked = NULL; - BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1492,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -static void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, @@ -1506,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void io_flush_timeouts(struct io_ring_ctx *ctx) +static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -1539,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used) io_flush_timeouts(ctx); @@ -1609,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) - wake_up_interruptible(&ctx->poll_wait); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1628,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) - wake_up_interruptible(&ctx->poll_wait); } /* Returns true if there are no backlogged entries after the flush */ @@ -1725,7 +1758,7 @@ static inline void io_get_task_refs(int nr) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_overflow_cqe *ocqe; @@ -1753,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } static inline bool __io_cqring_fill_event(struct io_ring_ctx 
*ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -1776,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data /* not as hot to bloat with inlining */ static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { return __io_cqring_fill_event(ctx, user_data, res, cflags); } -static void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -1801,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res, req->link = NULL; } } + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; - } else { - if (!percpu_ref_tryget(&ctx->refs)) - req = NULL; } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - - if (req) { - io_cqring_ev_posted(ctx); - percpu_ref_put(&ctx->refs); - } -} - -static inline bool io_req_needs_clean(struct io_kiocb *req) -{ - return req->flags & IO_REQ_CLEAN_FLAGS; + io_cqring_ev_posted(ctx); } -static void io_req_complete_state(struct io_kiocb *req, long res, - unsigned int cflags) +static inline void io_req_complete_state(struct io_kiocb *req, s32 res, + u32 cflags) { - if (io_req_needs_clean(req)) - io_clean_op(req); req->result = res; - req->compl.cflags = cflags; + req->cflags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - long res, unsigned cflags) + s32 res, u32 cflags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_state(req, res, cflags); @@ -1842,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } -static inline void io_req_complete(struct io_kiocb *req, long res) +static inline void io_req_complete(struct io_kiocb *req, s32 res) { __io_req_complete(req, 0, res, 0); } -static void io_req_complete_failed(struct io_kiocb *req, long res) +static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_complete_post(req, res, 0); @@ -1881,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &state->free_list); + wq_list_splice(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock(&ctx->completion_lock); } @@ -1890,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - int nr; /* * If we have more than a batch's worth of requests in our IRQ side @@ -1899,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) io_flush_cached_locked_reqs(ctx, state); - - nr = state->free_reqs; - while (!list_empty(&state->free_list)) { - struct io_kiocb *req = list_first_entry(&state->free_list, - struct io_kiocb, inflight_entry); - - list_del(&req->inflight_entry); - state->reqs[nr++] = req; - if (nr == ARRAY_SIZE(state->reqs)) - break; - } - - state->free_reqs = nr; - return nr != 0; + return 
!!state->free_list.next; } /* @@ -1921,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + void *reqs[IO_REQ_ALLOC_BATCH]; + struct io_kiocb *req; int ret, i; - BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - - if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) - goto got_req; + if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) + return true; - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); + ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; + reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!reqs[0]) + return false; ret = 1; } - for (i = 0; i < ret; i++) - io_preinit_req(state->reqs[i], ctx); - state->free_reqs = ret; -got_req: - state->free_reqs--; - return state->reqs[state->free_reqs]; + percpu_ref_get_many(&ctx->refs, ret); + for (i = 0; i < ret; i++) { + req = reqs[i]; + + io_preinit_req(req, ctx); + wq_stack_add_head(&req->comp_list, &state->free_list); + } + return true; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(!ctx->submit_state.free_list.next)) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); } static inline void io_put_file(struct file *file) @@ -1961,35 +1983,28 @@ static inline void io_put_file(struct file *file) fput(file); } -static void io_dismantle_req(struct io_kiocb *req) +static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; - if (io_req_needs_clean(req)) + if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); if (!(flags & REQ_F_FIXED_FILE)) io_put_file(req->file); - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); - if (req->async_data) { - kfree(req->async_data); - req->async_data = NULL; - } } -static void __io_free_req(struct io_kiocb *req) +static __cold void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); spin_lock(&ctx->completion_lock); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; spin_unlock(&ctx->completion_lock); - - percpu_ref_put(&ctx->refs); } static inline void io_remove_next_linked(struct io_kiocb *req) @@ -2075,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req) return posted; } -static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +static void __io_req_find_next_prep(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + spin_lock(&ctx->completion_lock); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + 
spin_unlock(&ctx->completion_lock); + if (posted) + io_cqring_ev_posted(ctx); +} + +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; + if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) + return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & IO_DISARM_MASK) { - struct io_ring_ctx *ctx = req->ctx; - bool posted; - - spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); - if (posted) - io_commit_cqring(req->ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); - } + if (unlikely(req->flags & IO_DISARM_MASK)) + __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } -static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -{ - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; - return __io_req_find_next(req); -} - static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; if (*locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); *locked = false; } @@ -2132,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node; - if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) + if (!tctx->task_list.first && locked) io_submit_flush_completions(ctx); spin_lock_irq(&tctx->task_lock); @@ -2195,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req) * will do the job. */ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? 
TWA_NONE : TWA_SIGNAL; - if (!task_work_add(tsk, &tctx->task_work, notify)) { - wake_up_process(tsk); + if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { + if (notify == TWA_NONE) + wake_up_process(tsk); return; } @@ -2274,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked) io_free_req(req); } -struct req_batch { - struct task_struct *task; - int task_refs; - int ctx_refs; -}; - -static inline void io_init_req_batch(struct req_batch *rb) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) + __must_hold(&ctx->uring_lock) { - rb->task_refs = 0; - rb->ctx_refs = 0; - rb->task = NULL; -} + struct task_struct *task = NULL; + int task_refs = 0; -static void io_req_free_batch_finish(struct io_ring_ctx *ctx, - struct req_batch *rb) -{ - if (rb->ctx_refs) - percpu_ref_put_many(&ctx->refs, rb->ctx_refs); - if (rb->task) - io_put_task(rb->task, rb->task_refs); -} + do { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); -static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, - struct io_submit_state *state) -{ - io_queue_next(req); - io_dismantle_req(req); + if (unlikely(req->flags & REQ_F_REFCOUNT)) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } - if (req->task != rb->task) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); - rb->task = req->task; - rb->task_refs = 0; - } - rb->task_refs++; - rb->ctx_refs++; + io_req_put_rsrc_locked(req, ctx); + io_queue_next(req); + io_dismantle_req(req); - if (state->free_reqs != ARRAY_SIZE(state->reqs)) - state->reqs[state->free_reqs++] = req; - else - list_add(&req->inflight_entry, &state->free_list); + if (req->task != task) { + if (task) + io_put_task(task, task_refs); + task = req->task; + task_refs = 0; + } + task_refs++; + node = req->comp_list.next; + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + } while (node); + + if (task) + io_put_task(task, task_refs); } -static void io_submit_flush_completions(struct io_ring_ctx *ctx) +static void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { + struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - int i, nr = state->compl_nr; - struct req_batch rb; spin_lock(&ctx->completion_lock); - for (i = 0; i < nr; i++) { - struct io_kiocb *req = state->compl_reqs[i]; + wq_list_for_each(node, prev, &state->compl_reqs) { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); __io_cqring_fill_event(ctx, req->user_data, req->result, - req->compl.cflags); + req->cflags); } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_init_req_batch(&rb); - for (i = 0; i < nr; i++) { - struct io_kiocb *req = state->compl_reqs[i]; - - if (req_ref_put_and_test(req)) - io_req_free_batch(&rb, req, &ctx->submit_state); - } - - io_req_free_batch_finish(ctx, &rb); - state->compl_nr = 0; + io_free_batch_list(ctx, state->compl_reqs.first); + INIT_WQ_LIST(&state->compl_reqs); } /* @@ -2404,12 +2403,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { - struct io_buffer *kbuf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; - return io_put_kbuf(req, kbuf); + return io_put_kbuf(req, req->kbuf); } static inline bool io_run_task_work(void) @@ -2423,50 +2419,22 @@ static inline 
bool io_run_task_work(void) return false; } -/* - * Find and free completed poll iocbs - */ -static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done) +static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { - struct req_batch rb; - struct io_kiocb *req; - - /* order with ->result store in io_complete_rw_iopoll() */ - smp_rmb(); - - io_init_req_batch(&rb); - while (!list_empty(done)) { - req = list_first_entry(done, struct io_kiocb, inflight_entry); - list_del(&req->inflight_entry); - - __io_cqring_fill_event(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); - (*nr_events)++; - - if (req_ref_put_and_test(req)) - io_req_free_batch(&rb, req, &ctx->submit_state); - } - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - io_req_free_batch_finish(ctx, &rb); -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min) -{ - struct io_kiocb *req, *tmp; - LIST_HEAD(done); - bool spin; + struct io_wq_work_node *pos, *start, *prev; + unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); + int nr_events = 0; /* * Only spin for completions if we don't have multiple devices hanging - * off our complete list, and we're under the requested amount. + * off our complete list. */ - spin = !ctx->poll_multi_queue && *nr_events < min; + if (ctx->poll_multi_queue || force_nonspin) + poll_flags |= BLK_POLL_ONESHOT; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { + wq_list_for_each(pos, start, &ctx->iopoll_list) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -2475,47 +2443,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * If we find a request that requires polling, break out * and complete those lists first, if we have entries there. */ - if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->inflight_entry, &done); - continue; - } - if (!list_empty(&done)) + if (READ_ONCE(req->iopoll_completed)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) - spin = false; + poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->inflight_entry, &done); + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) + break; } - if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done); + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + else if (!pos) + return 0; + + prev = start; + wq_list_for_each_resume(pos, prev) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - return 0; + /* order with io_complete_rw_iopoll(), e.g. ->result updates */ + if (!smp_load_acquire(&req->iopoll_completed)) + break; + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); + nr_events++; + } + + if (unlikely(!nr_events)) + return 0; + + io_commit_cqring(ctx); + io_cqring_ev_posted_iopoll(ctx); + pos = start ? start->next : ctx->iopoll_list.first; + wq_list_cut(&ctx->iopoll_list, prev, start); + io_free_batch_list(ctx, pos); + return nr_events; } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. 
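
The io_do_iopoll() loop above only trusts req->result after reading iopoll_completed with smp_load_acquire(); the writer side (io_complete_rw_iopoll(), a little further down) pairs it with smp_store_release(), replacing the old WRITE_ONCE() plus smp_wmb() sequence. A small C11 sketch of that publish/consume contract, with standard atomics and a pthread standing in for the kernel primitives:

/* Release-publish / acquire-consume of a completion flag guarding a
 * plainly written result field. Illustrative userspace code only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct request {
	long result;			/* written before the flag is set */
	atomic_int completed;		/* 0 = in flight, 1 = done */
};

static void *completer(void *arg)
{
	struct request *req = arg;

	req->result = 4096;		/* e.g. bytes transferred */
	/* release: writes above are visible once the flag is observed as 1 */
	atomic_store_explicit(&req->completed, 1, memory_order_release);
	return NULL;
}

int main(void)
{
	struct request req = { .result = 0, .completed = 0 };
	pthread_t thr;

	pthread_create(&thr, NULL, completer, &req);

	/* acquire: once completed reads 1, req->result is stable to use */
	while (!atomic_load_explicit(&req.completed, memory_order_acquire))
		;	/* busy-poll, like the iopoll loop */

	printf("request completed, result = %ld\n", req.result);
	pthread_join(thr, NULL);
	return 0;
}
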
*/ -static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->iopoll_list)) { - unsigned int nr_events = 0; - - io_do_iopoll(ctx, &nr_events, 0); - + while (!wq_list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ - if (nr_events == 0) + if (io_do_iopoll(ctx, true) == 0) break; /* * Ensure we allow local-to-the-cpu processing to take place, @@ -2562,7 +2545,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ - if (list_empty(&ctx->iopoll_list)) { + if (wq_list_empty(&ctx->iopoll_list)) { u32 tail = ctx->cached_cq_tail; mutex_unlock(&ctx->uring_lock); @@ -2571,11 +2554,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || - list_empty(&ctx->iopoll_list)) + wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min); - } while (!ret && nr_events < min && !need_resched()); + ret = io_do_iopoll(ctx, !min); + if (ret < 0) + break; + nr_events += ret; + ret = 0; + } while (nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); return ret; @@ -2600,9 +2587,9 @@ static bool io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *rw = req->async_data; - if (!rw) + if (!req_has_async_data(req)) return !io_req_prep_async(req); - iov_iter_restore(&rw->iter, &rw->iter_state); + iov_iter_restore(&rw->s.iter, &rw->s.iter_state); return true; } @@ -2646,7 +2633,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) { if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) { + if (unlikely(res != req->result)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; @@ -2661,16 +2648,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req, bool *locked) { unsigned int cflags = io_put_rw_kbuf(req); - long res = req->result; + int res = req->result; if (*locked) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; - io_req_complete_state(req, res, cflags); - state->compl_reqs[state->compl_nr++] = req; - if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) - io_submit_flush_completions(ctx); + io_req_add_compl_list(req); } else { io_req_complete_post(req, res, cflags); } @@ -2684,7 +2666,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); } -static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -2695,7 +2677,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_req_task_work_add(req); } -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -2706,12 +2688,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) req->flags |= REQ_F_REISSUE; return; } + req->result = res; } - 
WRITE_ONCE(req->result, res); - /* order with io_iopoll_complete() checking ->result */ - smp_wmb(); - WRITE_ONCE(req->iopoll_completed, 1); + /* order with io_iopoll_complete() checking ->iopoll_completed */ + smp_store_release(&req->iopoll_completed, 1); } /* @@ -2720,13 +2701,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req) +static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - const bool in_async = io_wq_current_is_worker(); + const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; /* workqueue context doesn't hold uring_lock, grab it now */ - if (unlikely(in_async)) + if (unlikely(needs_lock)) mutex_lock(&ctx->uring_lock); /* @@ -2734,23 +2715,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->iopoll_list)) { + if (wq_list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; - unsigned int queue_num0, queue_num1; - - list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - inflight_entry); - if (list_req->file != req->file) { + list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, + comp_list); + if (list_req->file != req->file) ctx->poll_multi_queue = true; - } else { - queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); - queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); - if (queue_num0 != queue_num1) - ctx->poll_multi_queue = true; - } } /* @@ -2758,11 +2731,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->inflight_entry, &ctx->iopoll_list); + wq_list_add_head(&req->comp_list, &ctx->iopoll_list); else - list_add_tail(&req->inflight_entry, &ctx->iopoll_list); + wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); - if (unlikely(in_async)) { + if (unlikely(needs_lock)) { /* * If IORING_SETUP_SQPOLL is enabled, sqes are either handle * in sq thread task context or in io worker task context. If @@ -2787,10 +2760,8 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool __io_file_supports_nowait(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, umode_t mode) { - umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && io_bdev_nowait(I_BDEV(file->f_mapping->host))) @@ -2810,28 +2781,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw) /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) return true; + return file->f_mode & FMODE_NOWAIT; +} - if (!(file->f_mode & FMODE_NOWAIT)) - return false; - - if (rw == READ) - return file->f_op->read_iter != NULL; +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. 
+ */ +static unsigned int io_file_get_flags(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + unsigned int res = 0; - return file->f_op->write_iter != NULL; + if (S_ISREG(mode)) + res |= FFS_ISREG; + if (__io_file_supports_nowait(file, mode)) + res |= FFS_NOWAIT; + return res; } -static bool io_file_supports_nowait(struct io_kiocb *req, int rw) +static inline bool io_file_supports_nowait(struct io_kiocb *req) { - if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) - return true; - else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) - return true; - - return __io_file_supports_nowait(req->file, rw); + return req->flags & REQ_F_SUPPORT_NOWAIT; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - int rw) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; @@ -2839,16 +2814,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned ioprio; int ret; - if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) - req->flags |= REQ_F_ISREG; + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; kiocb->ki_pos = file->f_pos; } - kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); - kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; @@ -2859,22 +2833,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 
*/ if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw))) + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) req->flags |= REQ_F_NOWAIT; - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - kiocb->ki_ioprio = ioprio; - } else - kiocb->ki_ioprio = get_current_ioprio(); - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || - !kiocb->ki_filp->f_op->iopoll) + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; @@ -2886,12 +2849,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_complete = io_complete_rw; } - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - req->imu = NULL; - io_req_set_rsrc_node(req); + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + kiocb->ki_ioprio = ioprio; + } else { + kiocb->ki_ioprio = get_current_ioprio(); } + req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->buf_index = READ_ONCE(sqe->buf_index); @@ -2915,7 +2884,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) ret = -EINTR; fallthrough; default: - kiocb->ki_complete(kiocb, ret, 0); + kiocb->ki_complete(kiocb, ret); } } @@ -2926,7 +2895,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ - if (io && io->bytes_done > 0) { + if (req_has_async_data(req) && io->bytes_done > 0) { if (ret < 0) ret = io->bytes_done; else @@ -2949,7 +2918,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_ring_ctx *ctx = req->ctx; req_set_fail(req); - if (issue_flags & IO_URING_F_NONBLOCK) { + if (issue_flags & IO_URING_F_UNLOCKED) { mutex_lock(&ctx->uring_lock); __io_req_complete(req, issue_flags, ret, cflags); mutex_unlock(&ctx->uring_lock); @@ -3020,13 +2989,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - struct io_ring_ctx *ctx = req->ctx; struct io_mapped_ubuf *imu = req->imu; u16 index, buf_index = req->buf_index; if (likely(!imu)) { + struct io_ring_ctx *ctx = req->ctx; + if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; + io_req_set_rsrc_node(req, ctx); index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = READ_ONCE(ctx->user_bufs[index]); req->imu = imu; @@ -3053,10 +3024,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) } static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, struct io_buffer *kbuf, - bool needs_lock) + int bgid, unsigned int issue_flags) { + struct io_buffer *kbuf = req->kbuf; struct io_buffer *head; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; @@ -3077,34 +3049,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, } if (*len > kbuf->len) *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; } else { kbuf = ERR_PTR(-ENOBUFS); } io_ring_submit_unlock(req->ctx, needs_lock); - return kbuf; } static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - bool needs_lock) + unsigned int issue_flags) { struct io_buffer *kbuf; u16 bgid; - kbuf 
= (struct io_buffer *) (unsigned long) req->rw.addr; bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + kbuf = io_buffer_select(req, len, bgid, issue_flags); if (IS_ERR(kbuf)) return kbuf; - req->rw.addr = (u64) (unsigned long) kbuf; - req->flags |= REQ_F_BUFFER_SELECTED; return u64_to_user_ptr(kbuf->addr); } #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { struct compat_iovec __user *uiov; compat_ssize_t clen; @@ -3120,7 +3090,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, needs_lock); + buf = io_rw_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3130,7 +3100,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, #endif static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); void __user *buf; @@ -3142,7 +3112,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, needs_lock); + buf = io_rw_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3151,12 +3121,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf; + struct io_buffer *kbuf = req->kbuf; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; iov[0].iov_base = u64_to_user_ptr(kbuf->addr); iov[0].iov_len = kbuf->len; return 0; @@ -3166,52 +3135,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, #ifdef CONFIG_COMPAT if (req->ctx->compat) - return io_compat_import(req, iov, needs_lock); + return io_compat_import(req, iov, issue_flags); #endif - return __io_iov_buffer_select(req, iov, needs_lock); + return __io_iov_buffer_select(req, iov, issue_flags); } -static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool needs_lock) +static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, + struct io_rw_state *s, + unsigned int issue_flags) { - void __user *buf = u64_to_user_ptr(req->rw.addr); - size_t sqe_len = req->rw.len; + struct iov_iter *iter = &s->iter; u8 opcode = req->opcode; + struct iovec *iovec; + void __user *buf; + size_t sqe_len; ssize_t ret; - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - *iovec = NULL; - return io_import_fixed(req, rw, iter); - } + BUILD_BUG_ON(ERR_PTR(0) != NULL); + + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) + return ERR_PTR(io_import_fixed(req, rw, iter)); /* buffer index only valid with fixed read/write, or buffer select */ - if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; + if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) + return ERR_PTR(-EINVAL); + + buf = u64_to_user_ptr(req->rw.addr); + sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + buf = 
io_rw_buffer_select(req, &sqe_len, issue_flags); if (IS_ERR(buf)) - return PTR_ERR(buf); + return ERR_CAST(buf); req->rw.len = sqe_len; } - ret = import_single_range(rw, buf, sqe_len, *iovec, iter); - *iovec = NULL; - return ret; + ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); + return ERR_PTR(ret); } + iovec = s->fast_iov; if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, *iovec, needs_lock); + ret = io_iov_buffer_select(req, iovec, issue_flags); if (!ret) - iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); - *iovec = NULL; - return ret; + iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + return ERR_PTR(ret); } - return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, + ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, req->ctx->compat); + if (unlikely(ret < 0)) + return ERR_PTR(ret); + return iovec; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct io_rw_state *s, + unsigned int issue_flags) +{ + *iovec = __io_import_iovec(rw, req, s, issue_flags); + if (unlikely(IS_ERR(*iovec))) + return PTR_ERR(*iovec); + + iov_iter_save_state(&s->iter, &s->iter_state); + return 0; } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) @@ -3236,7 +3225,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) */ if (kiocb->ki_flags & IOCB_HIPRI) return -EOPNOTSUPP; - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) && + !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; while (iov_iter_count(iter)) { @@ -3282,7 +3272,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, { struct io_async_rw *rw = req->async_data; - memcpy(&rw->iter, iter, sizeof(*iter)); + memcpy(&rw->s.iter, iter, sizeof(*iter)); rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ @@ -3291,33 +3281,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, if (!iovec) { unsigned iov_off = 0; - rw->iter.iov = rw->fast_iov; + rw->s.iter.iov = rw->s.fast_iov; if (iter->iov != fast_iov) { iov_off = iter->iov - fast_iov; - rw->iter.iov += iov_off; + rw->s.iter.iov += iov_off; } - if (rw->fast_iov != fast_iov) - memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, + if (rw->s.fast_iov != fast_iov) + memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; } } -static inline int io_alloc_async_data(struct io_kiocb *req) +static inline bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); - return req->async_data == NULL; + if (req->async_data) { + req->flags |= REQ_F_ASYNC_DATA; + return false; + } + return true; } static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, - struct iov_iter *iter, bool force) + struct io_rw_state *s, bool force) { if (!force && !io_op_defs[req->opcode].needs_async_setup) return 0; - if (!req->async_data) { + if (!req_has_async_data(req)) { struct io_async_rw *iorw; if (io_alloc_async_data(req)) { @@ -3325,10 +3318,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, return -ENOMEM; } - io_req_map_rw(req, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, s->fast_iov, &s->iter); iorw = req->async_data; /* we've copied and mapped the iter, ensure state is 
saved */ - iov_iter_save_state(&iorw->iter, &iorw->iter_state); + iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); } return 0; } @@ -3336,10 +3329,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { struct io_async_rw *iorw = req->async_data; - struct iovec *iov = iorw->fast_iov; + struct iovec *iov; int ret; - ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); + /* submission path, ->uring_lock should already be taken */ + ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); if (unlikely(ret < 0)) return ret; @@ -3347,7 +3341,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) iorw->free_iovec = iov; if (iov) req->flags |= REQ_F_NEED_CLEANUP; - iov_iter_save_state(&iorw->iter, &iorw->iter_state); return 0; } @@ -3355,11 +3348,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - return io_prep_rw(req, sqe, READ); + return io_prep_rw(req, sqe); } /* - * This is our waitqueue callback handler, registered through lock_page_async() + * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. * This gets called when the page is unlocked, and we generally expect that to * happen when the page IO is completed and the page is now uptodate. This will @@ -3431,7 +3424,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { - if (req->file->f_op->read_iter) + if (likely(req->file->f_op->read_iter)) return call_read_iter(req->file, &req->rw.kiocb, iter); else if (req->file->f_op->read) return loop_rw_iter(READ, req, iter); @@ -3447,43 +3440,40 @@ static bool need_read_all(struct io_kiocb *req) static int io_read(struct io_kiocb *req, unsigned int issue_flags) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter __iter, *iter = &__iter; - struct io_async_rw *rw = req->async_data; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct iov_iter_state __state, *state; + struct io_async_rw *rw; ssize_t ret, ret2; - if (rw) { - iter = &rw->iter; - state = &rw->iter_state; + if (!req_has_async_data(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + rw = req->async_data; + s = &rw->s; /* * We come here from an earlier attempt, restore our state to * match in case it doesn't. It's cheap enough that we don't * need to make this conditional. 
*/ - iov_iter_restore(iter, state); + iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; - } else { - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; - state = &__state; - iov_iter_save_state(iter, state); } - req->result = iov_iter_count(iter); + req->result = iov_iter_count(&s->iter); - /* Ensure we clear previously set non-block flag */ - if (!force_nonblock) - kiocb->ki_flags &= ~IOCB_NOWAIT; - else + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) { + ret = io_setup_async_rw(req, iovec, s, true); + return ret ?: -EAGAIN; + } kiocb->ki_flags |= IOCB_NOWAIT; - - /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_nowait(req, READ)) { - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); - return ret ?: -EAGAIN; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; } ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); @@ -3492,7 +3482,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret; } - ret = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, &s->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -3505,7 +3495,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret <= 0 || ret == req->result || !force_nonblock || + } else if (ret == req->result || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; @@ -3516,22 +3506,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(iter, state); + iov_iter_restore(&s->iter, &s->iter_state); - ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + ret2 = io_setup_async_rw(req, iovec, s, true); if (ret2) return ret2; iovec = NULL; rw = req->async_data; + s = &rw->s; /* * Now use our persistent iterator and state, if we aren't already. * We've restored and mapped the iter to match. */ - if (iter != &rw->iter) { - iter = &rw->iter; - state = &rw->iter_state; - } do { /* @@ -3539,11 +3526,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(iter, ret); - if (!iov_iter_count(iter)) + iov_iter_advance(&s->iter, ret); + if (!iov_iter_count(&s->iter)) break; rw->bytes_done += ret; - iov_iter_save_state(iter, state); + iov_iter_save_state(&s->iter, &s->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -3557,12 +3544,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, &s->iter); if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. 
*/ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(iter, state); + iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: kiocb_done(kiocb, ret, issue_flags); @@ -3577,47 +3564,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - return io_prep_rw(req, sqe, WRITE); + req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); + return io_prep_rw(req, sqe); } static int io_write(struct io_kiocb *req, unsigned int issue_flags) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter __iter, *iter = &__iter; - struct io_async_rw *rw = req->async_data; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct iov_iter_state __state, *state; ssize_t ret, ret2; - if (rw) { - iter = &rw->iter; - state = &rw->iter_state; - iov_iter_restore(iter, state); - iovec = NULL; - } else { - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); - if (ret < 0) + if (!req_has_async_data(req)) { + ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) return ret; - state = &__state; - iov_iter_save_state(iter, state); + } else { + struct io_async_rw *rw = req->async_data; + + s = &rw->s; + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; } - req->result = iov_iter_count(iter); + req->result = iov_iter_count(&s->iter); - /* Ensure we clear previously set non-block flag */ - if (!force_nonblock) - kiocb->ki_flags &= ~IOCB_NOWAIT; - else - kiocb->ki_flags |= IOCB_NOWAIT; + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) + goto copy_iov; - /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_nowait(req, WRITE)) - goto copy_iov; + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); if (unlikely(ret)) @@ -3637,10 +3623,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) } kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, iter); + if (likely(req->file->f_op->write_iter)) + ret2 = call_write_iter(req->file, kiocb, &s->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, iter); + ret2 = loop_rw_iter(WRITE, req, &s->iter); else ret2 = -EINVAL; @@ -3660,14 +3646,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: kiocb_done(kiocb, ret2, issue_flags); } else { copy_iov: - iov_iter_restore(iter, state); - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); + iov_iter_restore(&s->iter, &s->iter_state); + ret = 
io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } out_free: @@ -3803,7 +3789,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, return 0; } -static int io_mkdirat(struct io_kiocb *req, int issue_flags) +static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { struct io_mkdir *mkd = &req->mkdir; int ret; @@ -3852,7 +3838,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, return 0; } -static int io_symlinkat(struct io_kiocb *req, int issue_flags) +static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_symlink *sl = &req->symlink; int ret; @@ -3902,7 +3888,7 @@ static int io_linkat_prep(struct io_kiocb *req, return 0; } -static int io_linkat(struct io_kiocb *req, int issue_flags) +static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_hardlink *lnk = &req->hardlink; int ret; @@ -4321,9 +4307,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head; int ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); lockdep_assert_held(&ctx->uring_lock); @@ -4336,7 +4322,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); return 0; } @@ -4408,9 +4394,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head, *list; int ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); lockdep_assert_held(&ctx->uring_lock); @@ -4426,7 +4412,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); return 0; } @@ -4759,8 +4745,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = req->async_data; - if (!kmsg) { + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4919,23 +4906,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - bool needs_lock) + unsigned int issue_flags) { struct io_sr_msg *sr = &req->sr_msg; - struct io_buffer *kbuf; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); - if (IS_ERR(kbuf)) - return kbuf; - - sr->kbuf = kbuf; - req->flags |= REQ_F_BUFFER_SELECTED; - return kbuf; + return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); } static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) { - return io_put_kbuf(req, req->sr_msg.kbuf); + return io_put_kbuf(req, req->kbuf); } static int io_recvmsg_prep_async(struct io_kiocb *req) @@ -4983,8 +4963,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = req->async_data; - if (!kmsg) { + if (req_has_async_data(req)) { + 
kmsg = req->async_data; + } else { ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4992,7 +4973,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) } if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, !force_nonblock); + kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -5044,7 +5025,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, !force_nonblock); + kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); @@ -5175,7 +5156,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req->async_data) { + if (req_has_async_data(req)) { io = req->async_data; } else { ret = move_addr_to_kernel(req->connect.addr, @@ -5191,7 +5172,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_connect_file(req->file, &io->address, req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req->async_data) + if (req_has_async_data(req)) return -EAGAIN; if (io_alloc_async_data(req)) { ret = -ENOMEM; @@ -5351,16 +5332,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) -{ - bool done; - - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - return done; -} - static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; @@ -5482,7 +5453,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); req_ref_get(req); poll->wait.private = req; + *poll_ptr = poll; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; } pt->nr_entries++; @@ -5606,17 +5580,13 @@ static int io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; - int rw; - if (!req->file || !file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if (req->flags & REQ_F_POLLED) - return IO_APOLL_ABORTED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; + if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + return IO_APOLL_ABORTED; if (def->pollin) { - rw = READ; mask |= POLLIN | POLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ @@ -5624,14 +5594,9 @@ static int io_arm_poll_handler(struct io_kiocb *req) (req->sr_msg.msg_flags & MSG_ERRQUEUE)) mask &= ~POLLIN; } else { - rw = WRITE; mask |= POLLOUT | POLLWRNORM; } - /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_nowait(req, rw)) - return IO_APOLL_ABORTED; - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return IO_APOLL_ABORTED; @@ -5692,8 +5657,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) /* * Returns true if we found and killed one or more poll requests */ -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, + struct task_struct 
*tsk, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5847,7 +5812,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - done = io_poll_complete(req, mask); + done = __io_poll_complete(req, mask); + io_commit_cqring(req->ctx); } spin_unlock(&ctx->completion_lock); @@ -5923,7 +5889,10 @@ err: static void io_req_task_timeout(struct io_kiocb *req, bool *locked) { - req_set_fail(req); + struct io_timeout_data *data = req->async_data; + + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); io_req_complete_post(req, -ETIME, 0); } @@ -6129,7 +6098,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | + IORING_TIMEOUT_ETIME_SUCCESS)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -6140,7 +6110,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; - if (!req->async_data && io_alloc_async_data(req)) + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) return -ENOMEM; data = req->async_data; @@ -6297,6 +6269,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; u64 sqe_addr = req->cancel.addr; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_tctx_node *node; int ret; @@ -6305,7 +6278,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) goto done; /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; @@ -6314,7 +6287,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) if (ret != -ENOENT) break; } - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); done: if (ret < 0) req_set_fail(req); @@ -6341,6 +6314,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; @@ -6350,10 +6324,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.tags = 0; up.resv = 0; - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); if (ret < 0) req_set_fail(req); @@ -6449,7 +6423,7 @@ static int io_req_prep_async(struct io_kiocb *req) { if (!io_op_defs[req->opcode].needs_async_setup) return 0; - if (WARN_ON_ONCE(req->async_data)) + if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; @@ -6481,68 +6455,39 @@ static u32 io_get_sequence(struct io_kiocb *req) return seq; } -static bool io_drain_req(struct 
io_kiocb *req) +static __cold void io_drain_req(struct io_kiocb *req) { - struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; - u32 seq; - - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); - return true; - } - - /* - * If we need to drain a request in the middle of a link, drain the - * head request and the next request/link after the current link. - * Considering sequential execution of links, IOSQE_IO_DRAIN will be - * maintained for every request of our link. - */ - if (ctx->drain_next) { - req->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = false; - } - /* not interested in head, start from the first linked */ - io_for_each_link(pos, req->link) { - if (pos->flags & REQ_F_IO_DRAIN) { - ctx->drain_next = true; - req->flags |= REQ_F_IO_DRAIN; - break; - } - } + u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. */ - if (likely(list_empty_careful(&ctx->defer_list) && - !(req->flags & REQ_F_IO_DRAIN))) { + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { +queue: ctx->drain_active = false; - return false; + io_req_task_queue(req); + return; } - seq = io_get_sequence(req); - /* Still a chance to pass the sequence check */ - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) - return false; - ret = io_req_prep_async(req); - if (ret) - goto fail; + if (ret) { +fail: + io_req_complete_failed(req, ret); + return; + } io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { ret = -ENOMEM; -fail: - io_req_complete_failed(req, ret); - return true; + goto fail; } spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req, NULL); - return true; + goto queue; } trace_io_uring_defer(ctx, req, req->user_data); @@ -6550,23 +6495,13 @@ fail: de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock(&ctx->completion_lock); - return true; } static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - kfree((void *)(unsigned long)req->rw.addr); - break; - case IORING_OP_RECVMSG: - case IORING_OP_RECV: - kfree(req->sr_msg.kbuf); - break; - } + kfree(req->kbuf); + req->kbuf = NULL; } if (req->flags & REQ_F_NEED_CLEANUP) { @@ -6631,17 +6566,19 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_CREDS) put_cred(req->creds); - + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } req->flags &= ~IO_REQ_CLEAN_FLAGS; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; const struct cred *creds = NULL; int ret; - if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); switch (req->opcode) { @@ -6764,8 +6701,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret) return ret; /* If the op doesn't have a file, we're not polling for it */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) - io_iopoll_req_issued(req); + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) + io_iopoll_req_issued(req, issue_flags); return 0; } @@ -6781,6 +6718,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void 
io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + unsigned int issue_flags = IO_URING_F_UNLOCKED; + bool needs_poll = false; struct io_kiocb *timeout; int ret = 0; @@ -6795,23 +6734,42 @@ static void io_wq_submit_work(struct io_wq_work *work) io_queue_linked_timeout(timeout); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) - ret = -ECANCELED; + if (work->flags & IO_WQ_WORK_CANCEL) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } - if (!ret) { - do { - ret = io_issue_sqe(req, 0); - /* - * We can get EAGAIN for polled IO even though we're - * forcing a sync submission from here, since we can't - * wait for request slots on the block side. - */ - if (ret != -EAGAIN) - break; - cond_resched(); - } while (1); + if (req->flags & REQ_F_FORCE_ASYNC) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + bool opcode_poll = def->pollin || def->pollout; + + if (opcode_poll && file_can_poll(req->file)) { + needs_poll = true; + issue_flags |= IO_URING_F_NONBLOCK; + } } + do { + ret = io_issue_sqe(req, issue_flags); + if (ret != -EAGAIN) + break; + /* + * We can get EAGAIN for iopolled IO even though we're + * forcing a sync submission from here, since we can't + * wait for request slots on the block side. + */ + if (!needs_poll) { + cond_resched(); + continue; + } + + if (io_arm_poll_handler(req) == IO_APOLL_OK) + return; + /* aborted or ready, in either case retry blocking */ + needs_poll = false; + issue_flags &= ~IO_URING_F_NONBLOCK; + } while (1); + /* avoid locking problems by failing it from a clean context */ if (ret) io_req_task_queue_fail(req, ret); @@ -6835,12 +6793,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_nowait(file, READ)) - file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_nowait(file, WRITE)) - file_ptr |= FFS_ASYNC_WRITE; - if (S_ISREG(file_inode(file)->i_mode)) - file_ptr |= FFS_ISREG; + file_ptr |= io_file_get_flags(file); file_slot->file_ptr = file_ptr; } @@ -6857,8 +6810,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); - io_req_set_rsrc_node(req); + req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); + io_req_set_rsrc_node(req, ctx); return file; } @@ -6950,67 +6903,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void __io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe_arm_apoll(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) +{ + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + + switch (io_arm_poll_handler(req)) { + case IO_APOLL_READY: + if (linked_timeout) { + io_queue_linked_timeout(linked_timeout); + linked_timeout = NULL; + } + io_req_task_queue(req); + break; + case IO_APOLL_ABORTED: + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. 
+ */ + io_queue_async_work(req, NULL); + break; + } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); +} + +static inline void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout; int ret; -issue_sqe: ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (req->flags & REQ_F_COMPLETE_INLINE) { + io_req_add_compl_list(req); + return; + } /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - if (req->flags & REQ_F_COMPLETE_INLINE) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; - - state->compl_reqs[state->compl_nr++] = req; - if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) - io_submit_flush_completions(ctx); - return; - } - linked_timeout = io_prep_linked_timeout(req); if (linked_timeout) io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - linked_timeout = io_prep_linked_timeout(req); - - switch (io_arm_poll_handler(req)) { - case IO_APOLL_READY: - if (linked_timeout) - io_unprep_linked_timeout(req); - goto issue_sqe; - case IO_APOLL_ABORTED: - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req, NULL); - break; - } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); + io_queue_sqe_arm_apoll(req); } else { io_req_complete_failed(req, ret); } } -static inline void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (unlikely(req->ctx->drain_active) && io_drain_req(req)) - return; - - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { - __io_queue_sqe(req); - } else if (req->flags & REQ_F_FAIL) { + if (req->flags & REQ_F_FAIL) { io_req_complete_fail_submit(req); + } else if (unlikely(req->ctx->drain_active)) { + io_drain_req(req); } else { int ret = io_req_prep_async(req); @@ -7021,6 +6973,15 @@ static inline void io_queue_sqe(struct io_kiocb *req) } } +static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) +{ + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) + __io_queue_sqe(req); + else + io_queue_sqe_fallback(req); +} + /* * Check SQE restrictions (opcode and flags). * @@ -7030,9 +6991,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (likely(!ctx->restricted)) - return true; - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -7047,16 +7005,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } +static void io_init_req_drain(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *head = ctx->submit_state.link.head; + + ctx->drain_active = true; + if (head) { + /* + * If we need to drain a request in the middle of a link, drain + * the head request and the next request/link after the current + * link. Considering sequential execution of links, + * IOSQE_IO_DRAIN will be maintained for every request of our + * link. 
+ */ + head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + ctx->drain_next = true; + } +} + static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state; unsigned int sqe_flags; - int personality, ret = 0; + int personality; + u8 opcode; /* req is partially pre-initialised, see io_preinit_req() */ - req->opcode = READ_ONCE(sqe->opcode); + req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); @@ -7064,19 +7041,52 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->fixed_rsrc_refs = NULL; req->task = current; - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - if (unlikely(req->opcode >= IORING_OP_LAST)) + if (unlikely(opcode >= IORING_OP_LAST)) { + req->opcode = 0; return -EINVAL; - if (!io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; + } + if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { + /* enforce forwards compatibility on users */ + if (sqe_flags & ~SQE_VALID_FLAGS) + return -EINVAL; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + if (sqe_flags & IOSQE_IO_DRAIN) + io_init_req_drain(req); + } + if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { + if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) + return -EACCES; + /* knock it to the slow queue path, will be drained there */ + if (ctx->drain_active) + req->flags |= REQ_F_FORCE_ASYNC; + /* if there is no link, we're at "next" request and need to drain */ + if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { + ctx->drain_next = false; + ctx->drain_active = true; + req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + } + } - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[req->opcode].buffer_select) - return -EOPNOTSUPP; - if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) - ctx->drain_active = true; + if (io_op_defs[opcode].needs_file) { + struct io_submit_state *state = &ctx->submit_state; + + /* + * Plug now if we have more than 2 IO left after this, and the + * target is potentially a read/write to block based storage. + */ + if (state->need_plug && io_op_defs[opcode].plug) { + state->plug_started = true; + state->need_plug = false; + blk_start_plug_nr_ios(&state->plug, state->submit_nr); + } + + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); + if (unlikely(!req->file)) + return -EBADF; + } personality = READ_ONCE(sqe->personality); if (personality) { @@ -7086,27 +7096,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, get_cred(req->creds); req->flags |= REQ_F_CREDS; } - state = &ctx->submit_state; - /* - * Plug now if we have more than 1 IO left after this, and the target - * is potentially a read/write to block based storage. 
- */ - if (!state->plug_started && state->ios_left > 1 && - io_op_defs[req->opcode].plug) { - blk_start_plug(&state->plug); - state->plug_started = true; - } - - if (io_op_defs[req->opcode].needs_file) { - req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), - (sqe_flags & IOSQE_FIXED_FILE)); - if (unlikely(!req->file)) - ret = -EBADF; - } - - state->ios_left--; - return ret; + return io_req_prep(req, sqe); } static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, @@ -7118,7 +7109,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { -fail_req: + trace_io_uring_req_failed(sqe, ret); + /* fail even hard links since we don't submit */ if (link->head) { /* @@ -7141,10 +7133,6 @@ fail_req: return ret; } req_fail_link_node(req, ret); - } else { - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; } /* don't need @sqe from now on */ @@ -7174,33 +7162,32 @@ fail_req: link->last->link = req; link->last = req; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + return 0; /* last request of a link, enqueue the link */ - if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - link->head = NULL; - io_queue_sqe(head); - } - } else { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - link->head = req; - link->last = req; - } else { - io_queue_sqe(req); - } + link->head = NULL; + req = head; + } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + link->head = req; + link->last = req; + return 0; } + io_queue_sqe(req); return 0; } /* * Batched submission is done, ensure local IO is flushed out. */ -static void io_submit_state_end(struct io_submit_state *state, - struct io_ring_ctx *ctx) +static void io_submit_state_end(struct io_ring_ctx *ctx) { + struct io_submit_state *state = &ctx->submit_state; + if (state->link.head) io_queue_sqe(state->link.head); - if (state->compl_nr) - io_submit_flush_completions(ctx); + /* flush only after queuing links as they can generate completions */ + io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); } @@ -7212,7 +7199,8 @@ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { state->plug_started = false; - state->ios_left = max_ios; + state->need_plug = max_ios > 2; + state->submit_nr = max_ios; /* set only head, no need to init link_last in advance */ state->link.head = NULL; } @@ -7264,45 +7252,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { + unsigned int entries = io_sqring_entries(ctx); int submitted = 0; + if (unlikely(!entries)) + return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); - if (!percpu_ref_tryget_many(&ctx->refs, nr)) - return -EAGAIN; + nr = min3(nr, ctx->sq_entries, entries); io_get_task_refs(nr); io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { + do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - req = io_alloc_req(ctx); - if (unlikely(!req)) { + if (unlikely(!io_alloc_req_refill(ctx))) { if (!submitted) submitted = -EAGAIN; break; } + req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - list_add(&req->inflight_entry, &ctx->submit_state.free_list); + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ submitted++; if (io_submit_sqe(ctx, req, 
sqe)) break; - } + } while (submitted < nr); if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; int unused = nr - ref_used; current->io_uring->cached_refs += unused; - percpu_ref_put_many(&ctx->refs, unused); } - io_submit_state_end(&ctx->submit_state, ctx); + io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -7341,16 +7329,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!list_empty(&ctx->iopoll_list) || to_submit) { - unsigned nr_events = 0; + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0); + if (!wq_list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, true); /* * Don't submit if refs are dying, good for io_uring_register(), @@ -7370,7 +7357,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) return ret; } -static void io_sqd_update_thread_idle(struct io_sq_data *sqd) +static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) { struct io_ring_ctx *ctx; unsigned sq_thread_idle = 0; @@ -7427,7 +7414,7 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, cap_entries); - if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) sqt_spin = true; } if (io_run_task_work()) @@ -7448,7 +7435,7 @@ static int io_sq_thread(void *data) io_ring_set_wakeup_flag(ctx); if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { + !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; break; } @@ -7624,7 +7611,7 @@ static void io_free_page_table(void **table, size_t size) kfree(table); } -static void **io_alloc_page_table(size_t size) +static __cold void **io_alloc_page_table(size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); size_t init_size = size; @@ -7653,7 +7640,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) +static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) { struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; @@ -7699,10 +7686,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) + __must_hold(&ctx->uring_lock) { WARN_ON_ONCE(!ctx->rsrc_backup_node); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + io_rsrc_refs_drop(ctx); + if (data_to_kill) { struct io_rsrc_node *rsrc_node = ctx->rsrc_node; @@ -7730,7 +7720,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) return ctx->rsrc_backup_node ? 
0 : -ENOMEM; } -static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) +static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, + struct io_ring_ctx *ctx) { int ret; @@ -7786,9 +7777,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data) kfree(data); } -static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) +static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, + u64 __user *utags, unsigned nr, + struct io_rsrc_data **pdata) { struct io_rsrc_data *data; int ret = -ENOMEM; @@ -8356,12 +8347,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -8402,7 +8393,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); if (ret) fput(file); return ret; @@ -8412,11 +8403,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; int ret, i; - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = -ENXIO; if (unlikely(!ctx->file_data)) goto out; @@ -8442,7 +8434,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); return ret; } @@ -8558,8 +8550,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, return io_wq_create(concurrency, &data); } -static int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +static __cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -8606,8 +8598,8 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; @@ -9218,29 +9210,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) } } -static void io_req_cache_free(struct list_head *list) -{ - struct io_kiocb *req, *nxt; - - list_for_each_entry_safe(req, nxt, list, inflight_entry) { - list_del(&req->inflight_entry); - kmem_cache_free(req_cachep, req); - } -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; + int nr = 0; mutex_lock(&ctx->uring_lock); + io_flush_cached_locked_reqs(ctx, state); - if (state->free_reqs) { - kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); - state->free_reqs = 0; - } + while (state->free_list.next) { + struct io_wq_work_node *node; + struct io_kiocb *req; - 
io_flush_cached_locked_reqs(ctx, state); - io_req_cache_free(&state->free_list); + node = wq_stack_extract(&state->free_list); + req = container_of(node, struct io_kiocb, comp_list); + kmem_cache_free(req_cachep, req); + nr++; + } + if (nr) + percpu_ref_put_many(&ctx->refs, nr); mutex_unlock(&ctx->uring_lock); } @@ -9250,7 +9238,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } -static void io_ring_ctx_free(struct io_ring_ctx *ctx) +static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -9259,6 +9247,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } + io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); io_wait_rsrc_data(ctx->file_data); @@ -9282,6 +9271,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); flush_delayed_work(&ctx->rsrc_put_work); + flush_delayed_work(&ctx->fallback_work); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); @@ -9312,7 +9302,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->poll_wait, wait); + poll_wait(file, &ctx->cq_wait, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -9359,7 +9349,7 @@ struct io_tctx_exit { struct io_ring_ctx *ctx; }; -static void io_tctx_exit_cb(struct callback_head *cb) +static __cold void io_tctx_exit_cb(struct callback_head *cb) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_exit *work; @@ -9374,14 +9364,14 @@ static void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); return req->ctx == data; } -static void io_ring_exit_work(struct work_struct *work) +static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; @@ -9410,6 +9400,8 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } + io_req_caches_free(ctx); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { /* there is little hope left, don't run it too often */ interval = HZ * 60; @@ -9436,7 +9428,6 @@ static void io_ring_exit_work(struct work_struct *work) ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; - wake_up_process(node->task); mutex_unlock(&ctx->uring_lock); wait_for_completion(&exit.completion); @@ -9450,8 +9441,8 @@ static void io_ring_exit_work(struct work_struct *work) } /* Returns true if we found and killed one or more timeouts */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, + struct task_struct *tsk, bool cancel_all) { struct io_kiocb *req, *tmp; int canceled = 0; @@ -9473,7 +9464,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, return canceled != 0; } -static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long 
index; struct creds *creds; @@ -9535,8 +9526,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return ret; } -static bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) +static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); @@ -9561,7 +9553,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, return true; } -static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) { struct io_tctx_node *node; enum io_wq_cancel cret; @@ -9585,9 +9577,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) +static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; @@ -9611,7 +9603,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!list_empty_careful(&ctx->iopoll_list)) { + while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; } @@ -9638,7 +9630,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; + tctx = current->io_uring; + if (ctx->iowq_limits_set) { + unsigned int limits[2] = { ctx->iowq_limits[0], + ctx->iowq_limits[1], }; + + ret = io_wq_max_workers(tctx->io_wq, limits); + if (ret) + return ret; + } } if (!xa_load(&tctx->xa, (unsigned long)ctx)) { node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -9677,7 +9678,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_tctx_node(unsigned long index) +static __cold void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9700,7 +9701,7 @@ static void io_uring_del_tctx_node(unsigned long index) kfree(node); } -static void io_uring_clean_tctx(struct io_uring_task *tctx) +static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) { struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; @@ -9727,7 +9728,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static void io_uring_drop_tctx_refs(struct task_struct *task) +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -9743,7 +9744,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. 
*/ -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) +static __cold void io_uring_cancel_generic(bool cancel_all, + struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; @@ -9836,7 +9838,7 @@ static void *io_uring_validate_mmap_request(struct file *file, #ifdef CONFIG_MMU -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { size_t sz = vma->vm_end - vma->vm_start; unsigned long pfn; @@ -10021,7 +10023,7 @@ out_fput: } #ifdef CONFIG_PROC_FS -static int io_uring_show_cred(struct seq_file *m, unsigned int id, +static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { struct user_namespace *uns = seq_user_ns(m); @@ -10053,11 +10055,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } -static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) +static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) { struct io_sq_data *sq = NULL; + struct io_overflow_cqe *ocqe; + struct io_rings *r = ctx->rings; + unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; + unsigned int sq_head = READ_ONCE(r->sq.head); + unsigned int sq_tail = READ_ONCE(r->sq.tail); + unsigned int cq_head = READ_ONCE(r->cq.head); + unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int sq_entries, cq_entries; bool has_lock; - int i; + unsigned int i; + + /* + * we may get imprecise sqe and cqe info if uring is actively running + * since we get cached_sq_head and cached_cq_tail without uring_lock + * and sq_tail and cq_head are changed by userspace. But it's ok since + * we usually use these info when it is stuck. 
+ */ + seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); + seq_printf(m, "SqHead:\t%u\n", sq_head); + seq_printf(m, "SqTail:\t%u\n", sq_tail); + seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CqMask:\t0x%x\n", cq_mask); + seq_printf(m, "CqHead:\t%u\n", cq_head); + seq_printf(m, "CqTail:\t%u\n", cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); + sq_entries = min(sq_tail - sq_head, ctx->sq_entries); + for (i = 0; i < sq_entries; i++) { + unsigned int entry = i + sq_head; + unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx]; + + if (sq_idx > sq_mask) + continue; + sqe = &ctx->sq_sqes[sq_idx]; + seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", + sq_idx, sqe->opcode, sqe->fd, sqe->flags, + sqe->user_data); + } + seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); + cq_entries = min(cq_tail - cq_head, ctx->cq_entries); + for (i = 0; i < cq_entries; i++) { + unsigned int entry = i + cq_head; + struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags); + } /* * Avoid ABBA deadlock between the seq lock and the io_uring mutex, @@ -10099,7 +10149,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) xa_for_each(&ctx->personalities, index, cred) io_uring_show_cred(m, index, cred); } - seq_printf(m, "PollList:\n"); + if (has_lock) + mutex_unlock(&ctx->uring_lock); + + seq_puts(m, "PollList:\n"); spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; @@ -10109,12 +10162,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } + + seq_puts(m, "CqOverflowList:\n"); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + + } + spin_unlock(&ctx->completion_lock); - if (has_lock) - mutex_unlock(&ctx->uring_lock); } -static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) { struct io_ring_ctx *ctx = f->private_data; @@ -10138,8 +10199,8 @@ static const struct file_operations io_uring_fops = { #endif }; -static int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) { struct io_rings *rings; size_t size, sq_array_offset; @@ -10228,8 +10289,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) return file; } -static int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { struct io_ring_ctx *ctx; struct file *file; @@ -10387,7 +10448,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } -static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) { struct io_uring_probe *p; size_t size; @@ -10443,8 +10505,8 
@@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } -static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args) +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) { struct io_uring_restriction *res; size_t size; @@ -10578,7 +10640,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return __io_register_rsrc_update(ctx, type, &up, up.nr); } -static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, +static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; @@ -10604,8 +10666,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, - unsigned len) +static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, + void __user *arg, unsigned len) { struct io_uring_task *tctx = current->io_uring; cpumask_var_t new_mask; @@ -10631,7 +10693,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; @@ -10641,9 +10703,11 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } -static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user *arg) +static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) + __must_hold(&ctx->uring_lock) { + struct io_tctx_node *node; struct io_uring_task *tctx = NULL; struct io_sq_data *sqd = NULL; __u32 new_count[2]; @@ -10674,13 +10738,19 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, tctx = current->io_uring; } - ret = -EINVAL; - if (!tctx || !tctx->io_wq) - goto err; + BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; + memcpy(ctx->iowq_limits, new_count, sizeof(new_count)); + ctx->iowq_limits_set = true; + + ret = -EINVAL; + if (tctx && tctx->io_wq) { + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + goto err; + } else { + memset(new_count, 0, sizeof(new_count)); + } if (sqd) { mutex_unlock(&sqd->lock); @@ -10690,6 +10760,22 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; + /* that's it for SQPOLL, only the SQPOLL task creates requests */ + if (sqd) + return 0; + + /* now propagate the restriction to all registered users */ + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (WARN_ON_ONCE(!tctx->io_wq)) + continue; + + for (i = 0; i < ARRAY_SIZE(new_count); i++) + new_count[i] = ctx->iowq_limits[i]; + /* ignore errors, it always returns zero anyway */ + (void)io_wq_max_workers(tctx->io_wq, new_count); + } return 0; err: if (sqd) { @@ -10723,7 +10809,7 @@ static bool io_register_op_must_quiesce(int op) } } -static int io_ctx_quiesce(struct io_ring_ctx *ctx) +static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) { long ret; @@ -10738,10 +10824,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx) */ mutex_unlock(&ctx->uring_lock); do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) + 
ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); + if (ret) { + ret = min(0L, ret); break; + } + ret = io_run_task_work_sig(); + io_req_caches_free(ctx); } while (ret >= 0); mutex_lock(&ctx->uring_lock); @@ -10972,6 +11062,8 @@ static int __init io_uring_init(void) /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); + BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); + BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4ecd255e0511..811c898125a5 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -38,8 +38,7 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct request_queue *last_queue; - blk_qc_t cookie; + struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -49,29 +48,20 @@ struct iomap_dio { }; }; -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) -{ - struct request_queue *q = READ_ONCE(kiocb->private); - - if (!q) - return 0; - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); -} -EXPORT_SYMBOL_GPL(iomap_dio_iopoll); - static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); - if (dio->iocb->ki_flags & IOCB_HIPRI) + if (dio->iocb->ki_flags & IOCB_HIPRI) { bio_set_polled(bio, dio->iocb); + dio->submit.poll_bio = bio; + } - dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) - dio->submit.cookie = dio->dops->submit_io(iter, bio, pos); + dio->dops->submit_io(iter, bio, pos); else - dio->submit.cookie = submit_bio(bio); + submit_bio(bio); } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -135,7 +125,7 @@ static void iomap_dio_complete_work(struct work_struct *work) struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); + iocb->ki_complete(iocb, iomap_dio_complete(dio)); } /* @@ -164,9 +154,11 @@ static void iomap_dio_bio_end_io(struct bio *bio) } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); + WRITE_ONCE(dio->iocb->private, NULL); INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } else { + WRITE_ONCE(dio->iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); } } @@ -282,6 +274,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (!iov_iter_count(dio->submit.iter)) goto out; + /* + * We can only poll for single bio I/Os. + */ + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->iocb->ki_flags &= ~IOCB_HIPRI; + if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); @@ -339,6 +338,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); + /* + * We can only poll for single bio I/Os. 
+ */ + if (nr_pages) + dio->iocb->ki_flags &= ~IOCB_HIPRI; iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); @@ -485,8 +489,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.cookie = BLK_QC_T_NONE; - dio->submit.last_queue = NULL; + dio->submit.poll_bio = NULL; if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) @@ -565,8 +568,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, inode_dio_begin(inode); blk_start_plug(&plug); - while ((ret = iomap_iter(&iomi, ops)) > 0) + while ((ret = iomap_iter(&iomi, ops)) > 0) { iomi.processed = iomap_dio_iter(&iomi, dio); + + /* + * We can only poll for single bio I/Os. + */ + iocb->ki_flags &= ~IOCB_HIPRI; + } + blk_finish_plug(&plug); /* @@ -592,8 +602,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); - WRITE_ONCE(iocb->private, dio->submit.last_queue); + WRITE_ONCE(iocb->private, dio->submit.poll_bio); /* * We are about to drop our additional submission reference, which @@ -620,10 +629,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->submit.waiter)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !dio->submit.last_queue || - !blk_poll(dio->submit.last_queue, - dio->submit.cookie, true)) + if (!dio->submit.poll_bio || + !bio_poll(dio->submit.poll_bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 176580f54af9..104ae698443e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/mempool.h> #include <linux/seq_file.h> +#include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index bde787c354fc..8b9a72ae5efa 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto out; } - VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; - + VolumeSize = sb_bdev_nr_blocks(sb); if (VolumeSize) { if (newLVSize > VolumeSize) { printk(KERN_WARNING "jfs_extendfs: invalid size\n"); @@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) txQuiesce(sb); /* Reset size of direct inode */ - sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); + sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev); if (sbi->mntflag & JFS_INLINELOG) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 9030aeaf0f88..24cbc9946e01 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_resize_nosize: { - *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + *newLVSize = sb_bdev_nr_blocks(sb); if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); break; @@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto out_unload; } - inode->i_size = i_size_read(sb->s_bdev->bd_inode); + inode->i_size = bdev_nr_bytes(sb->s_bdev); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 
87aac4c72c37..1b07550485b9 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -178,7 +178,7 @@ int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, struct fd f = fdget(fd); int ret = -EBADF; - if (!f.file) + if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index cfc3ce8b815a..8e0a1378a4b1 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1111,7 +1111,14 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ - if (kn && kernfs_active(kn)) { + if (kn) { + /* Inactive nodes are invisible to the VFS so don't + * create a negative. + */ + if (!kernfs_active(kn)) { + up_read(&kernfs_rwsem); + return NULL; + } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 71c989f1568d..30a92ddc1817 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -298,8 +298,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, int blob_len, struct ksmbd_session *sess) { char *domain_name; - unsigned int lm_off, nt_off; - unsigned short nt_len; + unsigned int nt_off, dn_off; + unsigned short nt_len, dn_len; int ret; if (blob_len < sizeof(struct authenticate_message)) { @@ -314,15 +314,17 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, return -EINVAL; } - lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset); nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset); nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length); + dn_off = le32_to_cpu(authblob->DomainName.BufferOffset); + dn_len = le16_to_cpu(authblob->DomainName.Length); + + if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len) + return -EINVAL; /* TODO : use domain name that imported from configuration file */ - domain_name = smb_strndup_from_utf16((const char *)authblob + - le32_to_cpu(authblob->DomainName.BufferOffset), - le16_to_cpu(authblob->DomainName.Length), true, - sess->conn->local_nls); + domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, + dn_len, true, sess->conn->local_nls); if (IS_ERR(domain_name)) return PTR_ERR(domain_name); diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index af086d35398a..b57a0d8a392f 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -61,6 +61,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->local_nls = load_nls_default(); atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); + conn->total_credits = 1; + init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); INIT_LIST_HEAD(&conn->sessions); @@ -296,10 +298,12 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); - /* make sure we have enough to get to SMB header end */ - if (!ksmbd_pdu_size_has_room(pdu_size)) { - ksmbd_debug(CONN, "SMB request too short (%u bytes)\n", - pdu_size); + /* + * Check if pdu size is valid (min : smb header size, + * max : 0x00FFFFFF). 
+ */ + if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE || + pdu_size > MAX_STREAM_PROT_LEN) { continue; } diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h index 49a5a3afa118..5b8f3e0ebdb3 100644 --- a/fs/ksmbd/glob.h +++ b/fs/ksmbd/glob.h @@ -12,7 +12,7 @@ #include "unicode.h" #include "vfs_cache.h" -#define KSMBD_VERSION "3.1.9" +#define KSMBD_VERSION "3.4.2" extern int ksmbd_debug_types; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index 2fbe2bc1e093..c6718a05d347 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -211,6 +211,7 @@ struct ksmbd_tree_disconnect_request { */ struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 account_flags; }; /* @@ -317,6 +318,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_UID BIT(2) #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) +#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) /* * Share config flags. diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index d21629ae5c89..1019d3677d55 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -55,7 +55,7 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) void ksmbd_free_user(struct ksmbd_user *user) { - ksmbd_ipc_logout_request(user->name); + ksmbd_ipc_logout_request(user->name, user->flags); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index b2bb074a0150..aff80b029579 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -18,6 +18,7 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + unsigned int failed_login_count; }; static inline bool user_guest(struct ksmbd_user *user) diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 9aa46bb3e10d..030ca57c3784 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -80,18 +80,21 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { }; /* - * Returns the pointer to the beginning of the data area. Length of the data - * area and the offset to it (from the beginning of the smb are also returned. + * Set length of the data area and the offset to arguments. + * if they are invalid, return error. */ -static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, + struct smb2_hdr *hdr) { + int ret = 0; + *off = 0; *len = 0; /* error reqeusts do not have data area */ if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE) - return NULL; + return ret; /* * Following commands have data areas so we have to get the location @@ -165,69 +168,60 @@ static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) case SMB2_IOCTL: *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset); *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount); - break; default: ksmbd_debug(SMB, "no length check for command\n"); break; } - /* - * Invalid length or offset probably means data area is invalid, but - * we have little choice but to ignore the data area in this case. 
- */ if (*off > 4096) { - ksmbd_debug(SMB, "offset %d too large, data area ignored\n", - *off); - *len = 0; - *off = 0; - } else if (*off < 0) { - ksmbd_debug(SMB, - "negative offset %d to data invalid ignore data area\n", - *off); - *off = 0; - *len = 0; - } else if (*len < 0) { - ksmbd_debug(SMB, - "negative data length %d invalid, data area ignored\n", - *len); - *len = 0; - } else if (*len > 128 * 1024) { - ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len); - *len = 0; + ksmbd_debug(SMB, "offset %d too large\n", *off); + ret = -EINVAL; + } else if ((u64)*off + *len > MAX_STREAM_PROT_LEN) { + ksmbd_debug(SMB, "Request is larger than maximum stream protocol length(%u): %llu\n", + MAX_STREAM_PROT_LEN, (u64)*off + *len); + ret = -EINVAL; } - /* return pointer to beginning of data area, ie offset from SMB start */ - if ((*off != 0) && (*len != 0)) - return (char *)hdr + *off; - else - return NULL; + return ret; } /* * Calculate the size of the SMB message based on the fixed header * portion, the number of word parameters and the data portion of the message. */ -static unsigned int smb2_calc_size(void *buf) +static int smb2_calc_size(void *buf, unsigned int *len) { struct smb2_pdu *pdu = (struct smb2_pdu *)buf; struct smb2_hdr *hdr = &pdu->hdr; - int offset; /* the offset from the beginning of SMB to data area */ - int data_length; /* the length of the variable length data area */ + unsigned int offset; /* the offset from the beginning of SMB to data area */ + unsigned int data_length; /* the length of the variable length data area */ + int ret; + /* Structure Size has already been checked to make sure it is 64 */ - int len = le16_to_cpu(hdr->StructureSize); + *len = le16_to_cpu(hdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already * been checked to make sure it is the correct length. */ - len += le16_to_cpu(pdu->StructureSize2); + *len += le16_to_cpu(pdu->StructureSize2); + /* + * StructureSize2 of smb2_lock pdu is set to 48, indicating + * the size of smb2 lock request with single smb2_lock_element + * regardless of number of locks. Subtract single + * smb2_lock_element for correct buffer size check. + */ + if (hdr->Command == SMB2_LOCK) + *len -= sizeof(struct smb2_lock_element); if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) goto calc_size_exit; - smb2_get_data_area_len(&offset, &data_length, hdr); - ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length, + ret = smb2_get_data_area_len(&offset, &data_length, hdr); + if (ret) + return ret; + ksmbd_debug(SMB, "SMB2 data length %u offset %u\n", data_length, offset); if (data_length > 0) { @@ -237,16 +231,19 @@ static unsigned int smb2_calc_size(void *buf) * for some commands, typically those with odd StructureSize, * so we must add one to the calculation. 
*/ - if (offset + 1 < len) + if (offset + 1 < *len) { ksmbd_debug(SMB, - "data area offset %d overlaps SMB2 header %d\n", - offset + 1, len); - else - len = offset + data_length; + "data area offset %d overlaps SMB2 header %u\n", + offset + 1, *len); + return -EINVAL; + } + + *len = offset + data_length; } + calc_size_exit: - ksmbd_debug(SMB, "SMB2 len %d\n", len); - return len; + ksmbd_debug(SMB, "SMB2 len %u\n", *len); + return 0; } static inline int smb2_query_info_req_len(struct smb2_query_info_req *h) @@ -287,11 +284,13 @@ static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h) le32_to_cpu(h->MaxOutputResponse); } -static int smb2_validate_credit_charge(struct smb2_hdr *hdr) +static int smb2_validate_credit_charge(struct ksmbd_conn *conn, + struct smb2_hdr *hdr) { - int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; - int credit_charge = le16_to_cpu(hdr->CreditCharge); + unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; + unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; + int ret; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ -313,21 +312,37 @@ static int smb2_validate_credit_charge(struct smb2_hdr *hdr) req_len = smb2_ioctl_req_len(__hdr); expect_resp_len = smb2_ioctl_resp_len(__hdr); break; - default: + case SMB2_CANCEL: return 0; + default: + req_len = 1; + break; } - credit_charge = max(1, credit_charge); - max_len = max(req_len, expect_resp_len); + credit_charge = max_t(unsigned short, credit_charge, 1); + max_len = max_t(unsigned int, req_len, expect_resp_len); calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE); if (credit_charge < calc_credit_num) { - pr_err("Insufficient credit charge, given: %d, needed: %d\n", - credit_charge, calc_credit_num); + ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", + credit_charge, calc_credit_num); + return 1; + } else if (credit_charge > conn->max_credits) { + ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } - return 0; + spin_lock(&conn->credits_lock); + if (credit_charge <= conn->total_credits) { + conn->total_credits -= credit_charge; + ret = 0; + } else { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + ret = 1; + } + spin_unlock(&conn->credits_lock); + return ret; } int ksmbd_smb2_check_message(struct ksmbd_work *work) @@ -385,24 +400,20 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } - if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && - smb2_validate_credit_charge(hdr)) { - work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + if (smb2_calc_size(hdr, &clc_len)) return 1; - } - clc_len = smb2_calc_size(hdr); if (len != clc_len) { - /* server can return one byte more due to implied bcc[0] */ + /* client can return one byte more due to implied bcc[0] */ if (clc_len == len + 1) - return 0; + goto validate_credit; /* * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ if (ALIGN(clc_len, 8) == len) - return 0; + goto validate_credit; /* * windows client also pad up to 8 bytes when compounding. @@ -415,12 +426,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", len, clc_len, command, le64_to_cpu(hdr->MessageId)); - return 0; + goto validate_credit; } - if (command == SMB2_LOCK_HE && len == 88) - return 0; - ksmbd_debug(SMB, "cli req too short, len %d not %d. 
cmd:%d mid:%llu\n", len, clc_len, command, @@ -429,6 +437,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) return 1; } +validate_credit: + if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && + smb2_validate_credit_charge(work->conn, hdr)) { + work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + return 1; + } + return 0; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 197473871aa4..fb6a65d23139 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -187,11 +187,6 @@ static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = { [SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify}, }; -int init_smb2_0_server(struct ksmbd_conn *conn) -{ - return -EOPNOTSUPP; -} - /** * init_smb2_1_server() - initialize a smb server connection with smb2.1 * command dispatcher @@ -289,6 +284,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn) void init_smb2_max_read_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_read_size = sz; smb30_server_values.max_read_size = sz; smb302_server_values.max_read_size = sz; @@ -297,6 +293,7 @@ void init_smb2_max_read_size(unsigned int sz) void init_smb2_max_write_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_write_size = sz; smb30_server_values.max_write_size = sz; smb302_server_values.max_write_size = sz; @@ -305,6 +302,7 @@ void init_smb2_max_write_size(unsigned int sz) void init_smb2_max_trans_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_trans_size = sz; smb30_server_values.max_trans_size = sz; smb302_server_values.max_trans_size = sz; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index dcf907738610..7e448df3f847 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -236,9 +236,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) if (conn->need_neg == false) return -EINVAL; - if (!(conn->dialect >= SMB20_PROT_ID && - conn->dialect <= SMB311_PROT_ID)) - return -EINVAL; rsp_hdr = work->response_buf; @@ -295,22 +292,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) return 0; } -static int smb2_consume_credit_charge(struct ksmbd_work *work, - unsigned short credit_charge) -{ - struct ksmbd_conn *conn = work->conn; - unsigned int rsp_credits = 1; - - if (!conn->total_credits) - return 0; - - if (credit_charge > 0) - rsp_credits = credit_charge; - - conn->total_credits -= rsp_credits; - return rsp_credits; -} - /** * smb2_set_rsp_credits() - set number of credits in response buffer * @work: smb work containing smb response buffer @@ -320,49 +301,43 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest); - unsigned short credit_charge = 1, credits_granted = 0; - unsigned short aux_max, aux_credits, min_credits; - int rsp_credit_charge; + unsigned short credits_requested; + unsigned short credit_charge, credits_granted = 0; + unsigned short aux_max, aux_credits; - if (hdr->Command == SMB2_CANCEL) - goto out; + if (work->send_no_response) + return 0; - /* get default minimum credits by shifting maximum credits by 4 */ - min_credits = conn->max_credits >> 4; + hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits >= conn->max_credits) { + if (conn->total_credits > conn->max_credits) { + hdr->CreditRequest 
= 0; pr_err("Total credits overflow: %d\n", conn->total_credits); - conn->total_credits = min_credits; + return -EINVAL; } - rsp_credit_charge = - smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge)); - if (rsp_credit_charge < 0) - return -EINVAL; + credit_charge = max_t(unsigned short, + le16_to_cpu(req_hdr->CreditCharge), 1); + credits_requested = max_t(unsigned short, + le16_to_cpu(req_hdr->CreditRequest), 1); - hdr->CreditCharge = cpu_to_le16(rsp_credit_charge); + /* according to smb2.credits smbtorture, Windows server + * 2016 or later grant up to 8192 credits at once. + * + * TODO: Need to adjuct CreditRequest value according to + * current cpu load + */ + aux_credits = credits_requested - 1; + if (hdr->Command == SMB2_NEGOTIATE) + aux_max = 0; + else + aux_max = conn->max_credits - credit_charge; + aux_credits = min_t(unsigned short, aux_credits, aux_max); + credits_granted = credit_charge + aux_credits; - if (credits_requested > 0) { - aux_credits = credits_requested - 1; - aux_max = 32; - if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; - aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max; - credits_granted = aux_credits + credit_charge; - - /* if credits granted per client is getting bigger than default - * minimum credits then we should wrap it up within the limits. - */ - if ((conn->total_credits + credits_granted) > min_credits) - credits_granted = min_credits - conn->total_credits; - /* - * TODO: Need to adjuct CreditRequest value according to - * current cpu load - */ - } else if (conn->total_credits == 0) { - credits_granted = 1; - } + if (conn->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->max_credits - + conn->total_credits; conn->total_credits += credits_granted; work->credits_granted += credits_granted; @@ -371,7 +346,6 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) /* Update CreditRequest in last request */ hdr->CreditRequest = cpu_to_le16(work->credits_granted); } -out: ksmbd_debug(SMB, "credits: requested[%d] granted[%d] total_granted[%d]\n", credits_requested, credits_granted, @@ -475,6 +449,12 @@ bool is_chained_smb2_message(struct ksmbd_work *work) return false; } + if ((u64)get_rfc1002_len(work->response_buf) + MAX_CIFS_SMALL_BUFFER_SIZE > + work->response_sz) { + pr_err("next response offset exceeds response buffer size\n"); + return false; + } + ksmbd_debug(SMB, "got SMB2 chained command\n"); init_chained_smb2_rsp(work); return true; @@ -544,7 +524,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { struct smb2_hdr *hdr = work->request_buf; size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE; - size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE; + size_t large_sz = small_sz + work->conn->vals->max_trans_size; size_t sz = small_sz; int cmd = le16_to_cpu(hdr->Command); @@ -1166,13 +1146,6 @@ int smb2_handle_negotiate(struct ksmbd_work *work) case SMB21_PROT_ID: init_smb2_1_server(conn); break; - case SMB20_PROT_ID: - rc = init_smb2_0_server(conn); - if (rc) { - rsp->hdr.Status = STATUS_NOT_SUPPORTED; - goto err_out; - } - break; case SMB2X_PROT_ID: case BAD_PROT_ID: default: @@ -1191,11 +1164,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size); rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size); - if (conn->dialect > SMB20_PROT_ID) { - memcpy(conn->ClientGUID, req->ClientGUID, - SMB2_CLIENT_GUID_SIZE); - conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); - } + memcpy(conn->ClientGUID, req->ClientGUID, + 
SMB2_CLIENT_GUID_SIZE); + conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); rsp->StructureSize = cpu_to_le16(65); rsp->DialectRevision = cpu_to_le16(conn->dialect); @@ -1286,19 +1257,13 @@ static int generate_preauth_hash(struct ksmbd_work *work) return 0; } -static int decode_negotiation_token(struct ksmbd_work *work, - struct negotiate_message *negblob) +static int decode_negotiation_token(struct ksmbd_conn *conn, + struct negotiate_message *negblob, + size_t sz) { - struct ksmbd_conn *conn = work->conn; - struct smb2_sess_setup_req *req; - int sz; - if (!conn->use_spnego) return -EINVAL; - req = work->request_buf; - sz = le16_to_cpu(req->SecurityBufferLength); - if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) { if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) { conn->auth_mechs |= KSMBD_AUTH_NTLMSSP; @@ -1310,9 +1275,9 @@ static int decode_negotiation_token(struct ksmbd_work *work, } static int ntlm_negotiate(struct ksmbd_work *work, - struct negotiate_message *negblob) + struct negotiate_message *negblob, + size_t negblob_len) { - struct smb2_sess_setup_req *req = work->request_buf; struct smb2_sess_setup_rsp *rsp = work->response_buf; struct challenge_message *chgblob; unsigned char *spnego_blob = NULL; @@ -1321,8 +1286,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc; ksmbd_debug(SMB, "negotiate phase\n"); - sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); if (rc) return rc; @@ -1390,12 +1354,23 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, struct authenticate_message *authblob; struct ksmbd_user *user; char *name; - int sz; + unsigned int auth_msg_len, name_off, name_len, secbuf_len; + secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (secbuf_len < sizeof(struct authenticate_message)) { + ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); + return NULL; + } authblob = user_authblob(conn, req); - sz = le32_to_cpu(authblob->UserName.BufferOffset); - name = smb_strndup_from_utf16((const char *)authblob + sz, - le16_to_cpu(authblob->UserName.Length), + name_off = le32_to_cpu(authblob->UserName.BufferOffset); + name_len = le16_to_cpu(authblob->UserName.Length); + auth_msg_len = le16_to_cpu(req->SecurityBufferOffset) + secbuf_len; + + if (auth_msg_len < (u64)name_off + name_len) + return NULL; + + name = smb_strndup_from_utf16((const char *)authblob + name_off, + name_len, true, conn->local_nls); if (IS_ERR(name)) { @@ -1537,11 +1512,9 @@ binding_session: } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1623,11 +1596,9 @@ static int krb5_authenticate(struct ksmbd_work *work) } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1645,6 +1616,7 @@ int smb2_sess_setup(struct ksmbd_work *work) struct smb2_sess_setup_rsp *rsp = work->response_buf; struct ksmbd_session *sess; struct negotiate_message *negblob; + unsigned int negblob_len, negblob_off; int rc = 0; ksmbd_debug(SMB, "Received request for session setup\n"); @@ -1725,10 +1697,16 @@ int 
smb2_sess_setup(struct ksmbd_work *work) if (sess->state == SMB2_SESSION_EXPIRED) sess->state = SMB2_SESSION_IN_PROGRESS; + negblob_off = le16_to_cpu(req->SecurityBufferOffset); + negblob_len = le16_to_cpu(req->SecurityBufferLength); + if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) || + negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) + return -EINVAL; + negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId + - le16_to_cpu(req->SecurityBufferOffset)); + negblob_off); - if (decode_negotiation_token(work, negblob) == 0) { + if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { if (conn->mechToken) negblob = (struct negotiate_message *)conn->mechToken; } @@ -1752,7 +1730,7 @@ int smb2_sess_setup(struct ksmbd_work *work) sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { if (negblob->MessageType == NtLmNegotiate) { - rc = ntlm_negotiate(work, negblob); + rc = ntlm_negotiate(work, negblob, negblob_len); if (rc) goto out_err; rsp->hdr.Status = @@ -1812,9 +1790,30 @@ out_err: conn->mechToken = NULL; } - if (rc < 0 && sess) { - ksmbd_session_destroy(sess); - work->sess = NULL; + if (rc < 0) { + /* + * SecurityBufferOffset should be set to zero + * in session setup error response. + */ + rsp->SecurityBufferOffset = 0; + + if (sess) { + bool try_delay = false; + + /* + * To avoid dictionary attacks (repeated session setups rapidly sent) to + * connect to server, ksmbd make a delay of a 5 seconds on session setup + * failure to make it harder to send enough random connection requests + * to break into a server. + */ + if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION) + try_delay = true; + + ksmbd_session_destroy(sess); + work->sess = NULL; + if (try_delay) + ssleep(5); + } } return rc; @@ -3795,6 +3794,24 @@ static int verify_info_level(int info_level) return 0; } +static int smb2_calc_max_out_buf_len(struct ksmbd_work *work, + unsigned short hdr2_len, + unsigned int out_buf_len) +{ + int free_len; + + if (out_buf_len > work->conn->vals->max_trans_size) + return -EINVAL; + + free_len = (int)(work->response_sz - + (get_rfc1002_len(work->response_buf) + 4)) - + hdr2_len; + if (free_len < 0) + return -EINVAL; + + return min_t(int, out_buf_len, free_len); +} + int smb2_query_dir(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; @@ -3871,9 +3888,13 @@ int smb2_query_dir(struct ksmbd_work *work) memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); d_info.wptr = (char *)rsp->Buffer; d_info.rptr = (char *)rsp->Buffer; - d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4)); - d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) - - sizeof(struct smb2_query_directory_rsp); + d_info.out_buf_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (d_info.out_buf_len < 0) { + rc = -EINVAL; + goto err_out; + } d_info.flags = srch_flag; /* @@ -4107,12 +4128,11 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(req->Flags)); } - buf_free_len = work->response_sz - - (get_rfc1002_len(rsp_org) + 4) - - sizeof(struct smb2_query_info_rsp); - - if (le32_to_cpu(req->OutputBufferLength) < buf_free_len) - buf_free_len = le32_to_cpu(req->OutputBufferLength); + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + return -EINVAL; rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (rc < 0) { @@ 
-4423,6 +4443,8 @@ static void get_file_stream_info(struct ksmbd_work *work, struct path *path = &fp->filp->f_path; ssize_t xattr_list_len; int nbytes = 0, streamlen, stream_name_len, next, idx = 0; + int buf_free_len; + struct smb2_query_info_req *req = ksmbd_req_buf_next(work); generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), &stat); @@ -4436,6 +4458,12 @@ static void get_file_stream_info(struct ksmbd_work *work, goto out; } + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + goto out; + while (idx < xattr_list_len) { stream_name = xattr_list + idx; streamlen = strlen(stream_name); @@ -4460,6 +4488,10 @@ static void get_file_stream_info(struct ksmbd_work *work, streamlen = snprintf(stream_buf, streamlen + 1, ":%s", &stream_name[XATTR_NAME_STREAM_LEN]); + next = sizeof(struct smb2_file_stream_info) + streamlen * 2; + if (next > buf_free_len) + break; + file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, stream_buf, streamlen, @@ -4470,12 +4502,13 @@ static void get_file_stream_info(struct ksmbd_work *work, file_info->StreamSize = cpu_to_le64(stream_name_len); file_info->StreamAllocationSize = cpu_to_le64(stream_name_len); - next = sizeof(struct smb2_file_stream_info) + streamlen; nbytes += next; + buf_free_len -= next; file_info->NextEntryOffset = cpu_to_le32(next); } - if (!S_ISDIR(stat.mode)) { + if (!S_ISDIR(stat.mode) && + buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) { file_info = (struct smb2_file_stream_info *) &rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, @@ -5499,7 +5532,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, struct ksmbd_share_config *share) { struct iattr attrs; - struct timespec64 ctime; struct file *filp; struct inode *inode; struct user_namespace *user_ns; @@ -5521,13 +5553,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); } - if (file_info->ChangeTime) { + attrs.ia_valid |= ATTR_CTIME; + if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - ctime = attrs.ia_ctime; - attrs.ia_valid |= ATTR_CTIME; - } else { - ctime = inode->i_ctime; - } + else + attrs.ia_ctime = inode->i_ctime; if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5573,11 +5603,9 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); + inode->i_ctime = attrs.ia_ctime; + attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(user_ns, dentry, &attrs, NULL); - if (!rc) { - inode->i_ctime = ctime; - mark_inode_dirty(inode); - } inode_unlock(inode); } return rc; @@ -6241,8 +6269,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) (offsetof(struct smb2_write_req, Buffer) - 4)) { data_buf = (char *)&req->Buffer[0]; } else { - if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || - (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) { pr_err("invalid write data offset %u, smb_len %u\n", le16_to_cpu(req->DataOffset), get_rfc1002_len(req)); @@ -6400,8 +6427,7 @@ int smb2_write(struct ksmbd_work *work) (offsetof(struct smb2_write_req, Buffer) - 4)) { data_buf = (char *)&req->Buffer[0]; } else { - if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || - (le16_to_cpu(req->DataOffset) + length > 
get_rfc1002_len(req))) { + if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) { pr_err("invalid write data offset %u, smb_len %u\n", le16_to_cpu(req->DataOffset), get_rfc1002_len(req)); @@ -7044,24 +7070,26 @@ out2: return err; } -static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, +static int fsctl_copychunk(struct ksmbd_work *work, + struct copychunk_ioctl_req *ci_req, + unsigned int cnt_code, + unsigned int input_count, + unsigned long long volatile_id, + unsigned long long persistent_id, struct smb2_ioctl_rsp *rsp) { - struct copychunk_ioctl_req *ci_req; struct copychunk_ioctl_rsp *ci_rsp; struct ksmbd_file *src_fp = NULL, *dst_fp = NULL; struct srv_copychunk *chunks; unsigned int i, chunk_count, chunk_count_written = 0; unsigned int chunk_size_written = 0; loff_t total_size_written = 0; - int ret, cnt_code; + int ret = 0; - cnt_code = le32_to_cpu(req->CntCode); - ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0]; ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0]; - rsp->VolatileFileId = req->VolatileFileId; - rsp->PersistentFileId = req->PersistentFileId; + rsp->VolatileFileId = cpu_to_le64(volatile_id); + rsp->PersistentFileId = cpu_to_le64(persistent_id); ci_rsp->ChunksWritten = cpu_to_le32(ksmbd_server_side_copy_max_chunk_count()); ci_rsp->ChunkBytesWritten = @@ -7071,12 +7099,13 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, chunks = (struct srv_copychunk *)&ci_req->Chunks[0]; chunk_count = le32_to_cpu(ci_req->ChunkCount); + if (chunk_count == 0) + goto out; total_size_written = 0; /* verify the SRV_COPYCHUNK_COPY packet */ if (chunk_count > ksmbd_server_side_copy_max_chunk_count() || - le32_to_cpu(req->InputCount) < - offsetof(struct copychunk_ioctl_req, Chunks) + + input_count < offsetof(struct copychunk_ioctl_req, Chunks) + chunk_count * sizeof(struct srv_copychunk)) { rsp->hdr.Status = STATUS_INVALID_PARAMETER; return -EINVAL; @@ -7097,9 +7126,7 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, src_fp = ksmbd_lookup_foreign_fd(work, le64_to_cpu(ci_req->ResumeKey[0])); - dst_fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + dst_fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id); ret = -EINVAL; if (!src_fp || src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) { @@ -7174,8 +7201,8 @@ static __be32 idev_ipv4_address(struct in_device *idev) } static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, - struct smb2_ioctl_req *req, - struct smb2_ioctl_rsp *rsp) + struct smb2_ioctl_rsp *rsp, + unsigned int out_buf_len) { struct network_interface_info_ioctl_rsp *nii_rsp = NULL; int nbytes = 0; @@ -7187,6 +7214,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, rtnl_lock(); for_each_netdev(&init_net, netdev) { + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } + if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7266,11 +7299,6 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, if (nii_rsp) nii_rsp->Next = 0; - if (!nbytes) { - rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; - return -EINVAL; - } - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); return nbytes; @@ -7278,11 +7306,16 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn, struct 
validate_negotiate_info_req *neg_req, - struct validate_negotiate_info_rsp *neg_rsp) + struct validate_negotiate_info_rsp *neg_rsp, + unsigned int in_buf_len) { int ret = 0; int dialect; + if (in_buf_len < sizeof(struct validate_negotiate_info_req) + + le16_to_cpu(neg_req->DialectCount) * sizeof(__le16)) + return -EINVAL; + dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects, neg_req->DialectCount); if (dialect == BAD_PROT_ID || dialect != conn->dialect) { @@ -7316,7 +7349,7 @@ err_out: static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, struct file_allocated_range_buffer *qar_req, struct file_allocated_range_buffer *qar_rsp, - int in_count, int *out_count) + unsigned int in_count, unsigned int *out_count) { struct ksmbd_file *fp; loff_t start, length; @@ -7343,7 +7376,8 @@ static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, } static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id, - int out_buf_len, struct smb2_ioctl_req *req, + unsigned int out_buf_len, + struct smb2_ioctl_req *req, struct smb2_ioctl_rsp *rsp) { struct ksmbd_rpc_command *rpc_resp; @@ -7457,8 +7491,7 @@ int smb2_ioctl(struct ksmbd_work *work) { struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp, *rsp_org; - int cnt_code, nbytes = 0; - int out_buf_len; + unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len; u64 id = KSMBD_NO_FID; struct ksmbd_conn *conn = work->conn; int ret = 0; @@ -7486,8 +7519,14 @@ int smb2_ioctl(struct ksmbd_work *work) } cnt_code = le32_to_cpu(req->CntCode); - out_buf_len = le32_to_cpu(req->MaxOutputResponse); - out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len); + ret = smb2_calc_max_out_buf_len(work, 48, + le32_to_cpu(req->MaxOutputResponse)); + if (ret < 0) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + goto out; + } + out_buf_len = (unsigned int)ret; + in_buf_len = le32_to_cpu(req->InputCount); switch (cnt_code) { case FSCTL_DFS_GET_REFERRALS: @@ -7515,6 +7554,7 @@ int smb2_ioctl(struct ksmbd_work *work) break; } case FSCTL_PIPE_TRANSCEIVE: + out_buf_len = min_t(u32, KSMBD_IPC_MAX_PAYLOAD, out_buf_len); nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp); break; case FSCTL_VALIDATE_NEGOTIATE_INFO: @@ -7523,9 +7563,16 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct validate_negotiate_info_req)) + return -EINVAL; + + if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) + return -EINVAL; + ret = fsctl_validate_negotiate_info(conn, (struct validate_negotiate_info_req *)&req->Buffer[0], - (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]); + (struct validate_negotiate_info_rsp *)&rsp->Buffer[0], + in_buf_len); if (ret < 0) goto out; @@ -7534,9 +7581,10 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); break; case FSCTL_QUERY_NETWORK_INTERFACE_INFO: - nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp); - if (nbytes < 0) + ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len); + if (ret < 0) goto out; + nbytes = ret; break; case FSCTL_REQUEST_RESUME_KEY: if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) { @@ -7561,15 +7609,33 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct copychunk_ioctl_req)) { + ret = -EINVAL; + goto out; + } + if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) { ret = -EINVAL; goto out; } nbytes = sizeof(struct copychunk_ioctl_rsp); - fsctl_copychunk(work, req, rsp); + rsp->VolatileFileId = req->VolatileFileId; + rsp->PersistentFileId = 
req->PersistentFileId; + fsctl_copychunk(work, + (struct copychunk_ioctl_req *)&req->Buffer[0], + le32_to_cpu(req->CntCode), + le32_to_cpu(req->InputCount), + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId), + rsp); break; case FSCTL_SET_SPARSE: + if (in_buf_len < sizeof(struct file_sparse)) { + ret = -EINVAL; + goto out; + } + ret = fsctl_set_sparse(work, id, (struct file_sparse *)&req->Buffer[0]); if (ret < 0) @@ -7588,6 +7654,11 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct file_zero_data_information)) { + ret = -EINVAL; + goto out; + } + zero_data = (struct file_zero_data_information *)&req->Buffer[0]; @@ -7607,6 +7678,11 @@ int smb2_ioctl(struct ksmbd_work *work) break; } case FSCTL_QUERY_ALLOCATED_RANGES: + if (in_buf_len < sizeof(struct file_allocated_range_buffer)) { + ret = -EINVAL; + goto out; + } + ret = fsctl_query_allocated_ranges(work, id, (struct file_allocated_range_buffer *)&req->Buffer[0], (struct file_allocated_range_buffer *)&rsp->Buffer[0], @@ -7647,6 +7723,11 @@ int smb2_ioctl(struct ksmbd_work *work) struct duplicate_extents_to_file *dup_ext; loff_t src_off, dst_off, length, cloned; + if (in_buf_len < sizeof(struct duplicate_extents_to_file)) { + ret = -EINVAL; + goto out; + } + dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0]; fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle, @@ -7717,6 +7798,8 @@ out: rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND; else if (ret == -EOPNOTSUPP) rsp->hdr.Status = STATUS_NOT_SUPPORTED; + else if (ret == -ENOSPC) + rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; else if (ret < 0 || rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_INVALID_PARAMETER; smb2_set_err_rsp(work); @@ -8411,20 +8494,18 @@ int smb3_decrypt_req(struct ksmbd_work *work) struct smb2_hdr *hdr; unsigned int pdu_length = get_rfc1002_len(buf); struct kvec iov[2]; - unsigned int buf_data_size = pdu_length + 4 - + int buf_data_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr); struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; int rc = 0; - if (pdu_length + 4 < - sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) { + if (buf_data_size < sizeof(struct smb2_hdr)) { pr_err("Transform message is too small (%u)\n", pdu_length); return -ECONNABORTED; } - if (pdu_length + 4 < - le32_to_cpu(tr_hdr->OriginalMessageSize) + sizeof(struct smb2_transform_hdr)) { + if (buf_data_size < le32_to_cpu(tr_hdr->OriginalMessageSize)) { pr_err("Transform message is broken\n"); return -ECONNABORTED; } diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 261825d06391..ff5a2f01d34a 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -113,6 +113,8 @@ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) +#define SMB3_MIN_IOSIZE (64 * 1024) +#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) /* * SMB2 Header Definition @@ -1637,7 +1639,6 @@ struct smb2_posix_info { } __packed; /* functions */ -int init_smb2_0_server(struct ksmbd_conn *conn); void init_smb2_1_server(struct ksmbd_conn *conn); void init_smb3_0_server(struct ksmbd_conn *conn); void init_smb3_02_server(struct ksmbd_conn *conn); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index db8042a173d0..707490ab1f4c 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -21,7 +21,6 @@ static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; #define MAGIC_CHAR '~' #define PERIOD '.' 
#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE])) -#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr)) struct smb_protocol { int index; @@ -89,7 +88,7 @@ unsigned int ksmbd_server_side_copy_max_total_size(void) inline int ksmbd_min_protocol(void) { - return SMB2_PROT; + return SMB21_PROT; } inline int ksmbd_max_protocol(void) @@ -294,11 +293,6 @@ int ksmbd_init_smb_server(struct ksmbd_work *work) return 0; } -bool ksmbd_pdu_size_has_room(unsigned int pdu) -{ - return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4); -} - int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, struct ksmbd_file *dir, struct ksmbd_dir_info *d_info, @@ -433,7 +427,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, static int __smb2_negotiate(struct ksmbd_conn *conn) { - return (conn->dialect >= SMB20_PROT_ID && + return (conn->dialect >= SMB21_PROT_ID && conn->dialect <= SMB311_PROT_ID); } @@ -463,7 +457,7 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command) } } - if (command == SMB2_NEGOTIATE_HE) { + if (command == SMB2_NEGOTIATE_HE && __smb2_negotiate(conn)) { ret = smb2_handle_negotiate(work); init_smb2_neg_rsp(work); return ret; diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 994abede27e9..6e79e7577f6b 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -48,6 +48,8 @@ #define CIFS_DEFAULT_IOSIZE (64 * 1024) #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ +#define MAX_STREAM_PROT_LEN 0x00FFFFFF + /* Responses when opening a file. */ #define F_SUPERSEDED 0 #define F_OPENED 1 @@ -493,8 +495,6 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); int ksmbd_init_smb_server(struct ksmbd_work *work); -bool ksmbd_pdu_size_has_room(unsigned int pdu); - struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 44aea33a67fa..1acf1892a466 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -601,7 +601,7 @@ int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, return ret; } -int ksmbd_ipc_logout_request(const char *account) +int ksmbd_ipc_logout_request(const char *account, int flags) { struct ksmbd_ipc_msg *msg; struct ksmbd_logout_request *req; @@ -616,6 +616,7 @@ int ksmbd_ipc_logout_request(const char *account) msg->type = KSMBD_EVENT_LOGOUT_REQUEST; req = (struct ksmbd_logout_request *)msg->payload; + req->account_flags = flags; strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); ret = ipc_msg_send(msg); diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h index 9eacc895ffdb..5e5b90a0c187 100644 --- a/fs/ksmbd/transport_ipc.h +++ b/fs/ksmbd/transport_ipc.h @@ -25,7 +25,7 @@ ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess, struct sockaddr *peer_addr); int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, unsigned long long connect_id); -int ksmbd_ipc_logout_request(const char *account); +int ksmbd_ipc_logout_request(const char *account, int flags); struct ksmbd_share_config_response * ksmbd_ipc_share_config_request(const char *name); struct ksmbd_spnego_authen_response * diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 3a7fa23ba850..a2fd5a4d4cd5 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -549,6 +549,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (recvmsg->type) { case 
SMB_DIRECT_MSG_NEGOTIATE_REQ: + if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) { + put_empty_recvmsg(t, recvmsg); + return; + } t->negotiation_requested = true; t->full_packet_received = true; wake_up_interruptible(&t->wait_status); @@ -556,10 +560,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - int data_length = le32_to_cpu(data_transfer->data_length); + unsigned int data_length; int avail_recvmsg_count, receive_credits; + if (wc->byte_len < + offsetof(struct smb_direct_data_transfer, padding)) { + put_empty_recvmsg(t, recvmsg); + return; + } + + data_length = le32_to_cpu(data_transfer->data_length); if (data_length) { + if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + + (u64)data_length) { + put_empty_recvmsg(t, recvmsg); + return; + } + if (t->full_packet_received) recvmsg->first_segment = true; @@ -568,7 +585,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) else t->full_packet_received = true; - enqueue_reassembly(t, recvmsg, data_length); + enqueue_reassembly(t, recvmsg, (int)data_length); wake_up_interruptible(&t->wait_reassembly_queue); spin_lock(&t->receive_credit_lock); diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index b41954294d38..835b384b0895 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1023,7 +1023,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, - int in_count, int *out_count) + unsigned int in_count, unsigned int *out_count) { struct file *f = fp->filp; struct inode *inode = file_inode(fp->filp); diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 7b1dcaa3fbdc..b0d5b8feb4a3 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -166,7 +166,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, - int in_count, int *out_count); + unsigned int in_count, unsigned int *out_count); int ksmbd_vfs_unlink(struct user_namespace *user_ns, struct dentry *dir, struct dentry *dentry); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); diff --git a/fs/locks.c b/fs/locks.c index 3d6fb4ae847b..0fca9d680978 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2,117 +2,11 @@ /* * linux/fs/locks.c * - * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. - * Doug Evans (dje@spiff.uucp), August 07, 1992 + * We implement four types of file locks: BSD locks, posix locks, open + * file description locks, and leases. For details about BSD locks, + * see the flock(2) man page; for details about the other three, see + * fcntl(2). * - * Deadlock detection added. - * FIXME: one thing isn't handled yet: - * - mandatory locks (requires lots of changes elsewhere) - * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994. - * - * Miscellaneous edits, and a total rewrite of posix_lock_file() code. - * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 - * - * Converted file_lock_table to a linked list from an array, which eliminates - * the limits on how many active file locks are open. - * Chad Page (pageone@netcom.com), November 27, 1994 - * - * Removed dependency on file descriptors. 
dup()'ed file descriptors now - * get the same locks as the original file descriptors, and a close() on - * any file descriptor removes ALL the locks on the file for the current - * process. Since locks still depend on the process id, locks are inherited - * after an exec() but not after a fork(). This agrees with POSIX, and both - * BSD and SVR4 practice. - * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995 - * - * Scrapped free list which is redundant now that we allocate locks - * dynamically with kmalloc()/kfree(). - * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995 - * - * Implemented two lock personalities - FL_FLOCK and FL_POSIX. - * - * FL_POSIX locks are created with calls to fcntl() and lockf() through the - * fcntl() system call. They have the semantics described above. - * - * FL_FLOCK locks are created with calls to flock(), through the flock() - * system call, which is new. Old C libraries implement flock() via fcntl() - * and will continue to use the old, broken implementation. - * - * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated - * with a file pointer (filp). As a result they can be shared by a parent - * process and its children after a fork(). They are removed when the last - * file descriptor referring to the file pointer is closed (unless explicitly - * unlocked). - * - * FL_FLOCK locks never deadlock, an existing lock is always removed before - * upgrading from shared to exclusive (or vice versa). When this happens - * any processes blocked by the current lock are woken up and allowed to - * run before the new lock is applied. - * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 - * - * Removed some race conditions in flock_lock_file(), marked other possible - * races. Just grep for FIXME to see them. - * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. - * - * Addressed Dmitry's concerns. Deadlock checking no longer recursive. - * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep - * once we've checked for blocking and deadlocking. - * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996. - * - * Initial implementation of mandatory locks. SunOS turned out to be - * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.rst' for details. - * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. - * - * Don't allow mandatory locks on mmap()'ed files. Added simple functions to - * check if a file has mandatory locks, used by mmap(), open() and creat() to - * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference - * Manual, Section 2. - * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996. - * - * Tidied up block list handling. Added '/proc/locks' interface. - * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996. - * - * Fixed deadlock condition for pathological code that mixes calls to - * flock() and fcntl(). - * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996. - * - * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use - * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to - * guarantee sensible behaviour in the case where file system modules might - * be compiled with different options than the kernel itself. - * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. - * - * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel - * (Thomas.Meckel@mni.fh-giessen.de) for spotting this. - * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. 
- * - * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK - * locks. Changed process synchronisation to avoid dereferencing locks that - * have already been freed. - * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996. - * - * Made the block list a circular list to minimise searching in the list. - * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996. - * - * Made mandatory locking a mount option. Default is not to allow mandatory - * locking. - * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996. - * - * Some adaptations for NFS support. - * Olaf Kirch (okir@monad.swb.de), Dec 1996, - * - * Fixed /proc/locks interface so that we can't overrun the buffer we are handed. - * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997. - * - * Use slab allocator instead of kmalloc/kfree. - * Use generic list implementation from <linux/list.h>. - * Sped up posix_locks_deadlock by only considering blocked locks. - * Matthew Wilcox <willy@debian.org>, March, 2000. - * - * Leases and LOCK_MAND - * Matthew Wilcox <willy@debian.org>, June, 2000. - * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000. * * Locking conflicts and dependencies: * If multiple threads attempt to lock the same byte (or flock the same file) @@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) } static inline int flock_translate_cmd(int cmd) { - if (cmd & LOCK_MAND) - return cmd & (LOCK_MAND | LOCK_RW); switch (cmd) { case LOCK_SH: return F_RDLCK; @@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl, */ if (caller_fl->fl_file == sys_fl->fl_file) return false; - if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) - return false; return locks_conflict(caller_fl, sys_fl); } @@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * - %LOCK_SH -- a shared lock. * - %LOCK_EX -- an exclusive lock. * - %LOCK_UN -- remove an existing lock. - * - %LOCK_MAND -- a 'mandatory' flock. - * This exists to emulate Windows Share Modes. + * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED) * - * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other - * processes read and write access respectively. + * %LOCK_MAND support has been removed from the kernel. */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { @@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) cmd &= ~LOCK_NB; unlock = (cmd == LOCK_UN); - if (!unlock && !(cmd & LOCK_MAND) && - !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + goto out_putf; + + /* + * LOCK_MAND locks were broken for a long time in that they never + * conflicted with one another and didn't prevent any sort of open, + * read or write activity. + * + * Just ignore these requests now, to preserve legacy behavior, but + * throw a warning to let people know that they don't actually work. + */ + if (cmd & LOCK_MAND) { + pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). 
This support has been removed and the request ignored.\n"); + error = 0; goto out_putf; + } lock = flock_make_lock(f.file, cmd, NULL); if (IS_ERR(lock)) { @@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); + int type; fl_pid = locks_translate_pid(fl, proc_pidns); /* @@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (IS_FLOCK(fl)) { - if (fl->fl_type & LOCK_MAND) { - seq_puts(f, "FLOCK MSNFS "); - } else { - seq_puts(f, "FLOCK ADVISORY "); - } + seq_puts(f, "FLOCK ADVISORY "); } else if (IS_LEASE(fl)) { if (fl->fl_flags & FL_DELEG) seq_puts(f, "DELEG "); @@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } else { seq_puts(f, "UNKNOWN UNKNOWN "); } - if (fl->fl_type & LOCK_MAND) { - seq_printf(f, "%s ", - (fl->fl_type & LOCK_READ) - ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " - : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); - } else { - int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; + type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; - seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : - (type == F_RDLCK) ? "READ" : "UNLCK"); - } + seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : + (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, diff --git a/fs/namei.c b/fs/namei.c index 1946d9667790..1f9d2187c765 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) int error = get_write_access(inode); if (error) return error; - /* - * Refuse to truncate files with mandatory locks held on them. 
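/*
 * For reference, a minimal user-space sketch of what the flock(2) changes
 * above mean in practice: LOCK_SH/LOCK_EX/LOCK_UN keep their usual advisory
 * semantics, while a legacy LOCK_MAND request is now simply ignored by the
 * kernel (it returns success and logs a one-time warning).  The lock-file
 * path below is hypothetical.
 */
#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/example.lock", O_CREAT | O_RDWR, 0644);

	if (fd < 0)
		return 1;

	if (flock(fd, LOCK_EX) == 0) {	/* advisory exclusive lock */
		/* ... critical section ... */
		flock(fd, LOCK_UN);
	}

	/*
	 * If an old application still passes LOCK_MAND (where the libc
	 * headers expose it), the call now succeeds without taking any
	 * lock instead of creating a Windows-style share-mode lock.
	 */
	close(fd);
	return 0;
}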
- */ + error = security_path_truncate(path); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 0b6cd3b8734c..994ec22d4040 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -150,7 +150,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index acb1d22907da..5e56da748b2a 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, d->bdev = bdev; - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(d->bdev); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", @@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return PTR_ERR(bdev); d->bdev = bdev; - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(d->bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2e894fec036b..7a5f287c5391 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) res = (long) dreq->count; WARN_ON_ONCE(dreq->count < 0); } - dreq->iocb->ki_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res); } complete(&dreq->completion); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index aa353fd58240..24e7dccce355 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* - * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of - * any standard. In principle we might be able to support LOCK_MAND - * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the - * NFS code is not set up for it. - */ - if (fl->fl_type & LOCK_MAND) - return -EINVAL; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index edec45831585..0a9b72685f98 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -42,7 +42,6 @@ EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace - * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 6e9ea4ee0f73..3d1d17256a91 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT depends on NFSD_V4 && BLOCK select NFSD_PNFS select EXPORTFS_BLOCK_OPS - select SCSI_COMMON help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. 
The pNFS SCSI layout enables NFS diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c99dee99a3c1..e5c0982a381d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -9,9 +9,6 @@ #include <linux/pr.h> #include <linux/nfsd/debug.h> -#include <scsi/scsi_proto.h> -#include <scsi/scsi_common.h> -#include <scsi/scsi_request.h> #include "blocklayoutxdr.h" #include "pnfs.h" @@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = { #endif /* CONFIG_NFSD_BLOCKLAYOUT */ #ifdef CONFIG_NFSD_SCSILAYOUT -static int nfsd4_scsi_identify_device(struct block_device *bdev, - struct pnfs_block_volume *b) -{ - struct request_queue *q = bdev->bd_disk->queue; - struct request *rq; - struct scsi_request *req; - /* - * The allocation length (passed in bytes 3 and 4 of the INQUIRY - * command descriptor block) specifies the number of bytes that have - * been allocated for the data-in buffer. - * 252 is the highest one-byte value that is a multiple of 4. - * 65532 is the highest two-byte value that is a multiple of 4. - */ - size_t bufflen = 252, maxlen = 65532, len, id_len; - u8 *buf, *d, type, assoc; - int retries = 1, error; - - if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) - return -EINVAL; - -again: - buf = kzalloc(bufflen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rq = blk_get_request(q, REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) { - error = -ENOMEM; - goto out_free_buf; - } - req = scsi_req(rq); - - error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); - if (error) - goto out_put_request; - - req->cmd[0] = INQUIRY; - req->cmd[1] = 1; - req->cmd[2] = 0x83; - req->cmd[3] = bufflen >> 8; - req->cmd[4] = bufflen & 0xff; - req->cmd_len = COMMAND_SIZE(INQUIRY); - - blk_execute_rq(NULL, rq, 1); - if (req->result) { - pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - req->result); - error = -EIO; - goto out_put_request; - } - - len = (buf[2] << 8) + buf[3] + 4; - if (len > bufflen) { - if (len <= maxlen && retries--) { - blk_put_request(rq); - kfree(buf); - bufflen = len; - goto again; - } - pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", - len); - goto out_put_request; - } - - d = buf + 4; - for (d = buf + 4; d < buf + len; d += id_len + 4) { - id_len = d[3]; - type = d[1] & 0xf; - assoc = (d[1] >> 4) & 0x3; - - /* - * We only care about a EUI-64 and NAA designator types - * with LU association. - */ - if (assoc != 0x00) - continue; - if (type != 0x02 && type != 0x03) - continue; - if (id_len != 8 && id_len != 12 && id_len != 16) - continue; - - b->scsi.code_set = PS_CODE_SET_BINARY; - b->scsi.designator_type = type == 0x02 ? - PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA; - b->scsi.designator_len = id_len; - memcpy(b->scsi.designator, d + 4, id_len); - - /* - * If we found a 8 or 12 byte descriptor continue on to - * see if a 16 byte one is available. If we find a - * 16 byte descriptor we're done. 
- */ - if (id_len == 16) - break; - } - -out_put_request: - blk_put_request(rq); -out_free_buf: - kfree(buf); - return error; -} - #define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* @@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp) return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id; } +static const u8 designator_types[] = { + PS_DESIGNATOR_EUI64, + PS_DESIGNATOR_NAA, +}; + +static int +nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b) +{ + int ret, i; + + for (i = 0; i < ARRAY_SIZE(designator_types); i++) { + u8 type = designator_types[i]; + + ret = disk->fops->get_unique_id(disk, b->scsi.designator, type); + if (ret > 0) { + b->scsi.code_set = PS_CODE_SET_BINARY; + b->scsi.designator_type = type; + b->scsi.designator_len = ret; + return 0; + } + } + + return -EINVAL; +} + static int nfsd4_block_get_device_info_scsi(struct super_block *sb, struct nfs4_client *clp, @@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, struct pnfs_block_deviceaddr *dev; struct pnfs_block_volume *b; const struct pr_ops *ops; - int error; + int ret; dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + sizeof(struct pnfs_block_volume), GFP_KERNEL); @@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, b->type = PNFS_BLOCK_VOLUME_SCSI; b->scsi.pr_key = nfsd4_scsi_pr_key(clp); - error = nfsd4_scsi_identify_device(sb->s_bdev, b); - if (error) - return error; + ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b); + if (ret < 0) + goto out_free_dev; + ret = -EINVAL; ops = sb->s_bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: device %s does not support PRs.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); - if (error) { + ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); + if (ret) { pr_err("pNFS: failed to register key for device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, + ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, PR_EXCLUSIVE_ACCESS_REG_ONLY, 0); - if (error) { + if (ret) { pr_err("pNFS: failed to reserve device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } return 0; + +out_free_dev: + kfree(dev); + return ret; } static __be32 diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 7629248fdd53..be3c1aad50ea 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -542,7 +542,7 @@ nfsd_file_close_inode_sync(struct inode *inode) } /** - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file * @inode: inode of the file to attempt to remove * * Walk the whole hash bucket, looking for any files that correspond to "inode". 
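/*
 * Rough sketch of the driver side of the new scheme above: instead of nfsd
 * issuing an INQUIRY VPD 0x83 itself, the disk driver exposes a
 * ->get_unique_id hook that copies an EUI-64 or NAA designator into the
 * caller's buffer and returns its length (or a value <= 0 if that designator
 * type is unavailable).  The prototype and the foo_* names below are
 * assumptions inferred only from the call site in nfsd4_block_get_unique_id();
 * consult the block layer headers for the exact hook signature.
 */
#include <linux/blkdev.h>
#include <linux/string.h>

struct foo_device {
	u8 naa_id[8];		/* stable NAA identifier, hypothetical field */
};

static int foo_get_unique_id(struct gendisk *disk, u8 id[16], u8 type)
{
	struct foo_device *dev = disk->private_data;

	switch (type) {
	case 3:	/* NAA designator (SCSI VPD page 0x83 type 3) */
		memcpy(id, dev->naa_id, sizeof(dev->naa_id));
		return sizeof(dev->naa_id);	/* designator length in bytes */
	default:
		return 0;			/* designator type not provided */
	}
}

/*
 * The hook would then sit in the driver's block_device_operations next to
 * ->pr_ops, both of which the pNFS layout setup code now checks before
 * advertising LAYOUT_SCSI for an export.
 */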
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a97873f2d22b..6d1b5bb051c5 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #ifdef CONFIG_NFSD_SCSILAYOUT if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && - sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops && - blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue)) + sb->s_bdev && + sb->s_bdev->bd_disk->fops->pr_ops && + sb->s_bdev->bd_disk->fops->get_unique_id) exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7abeccb975b2..cf030ebe2827 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3544,15 +3544,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; cd->rd_maxcount -= entry_bytes; /* - * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so - * let's always let through the first entry, at least: + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. */ - if (!cd->rd_dircount) - goto fail; - name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; - if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) - goto fail; - cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } cd->cookie_offset = cookie_offset; skip_entry: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c2c3d9077dc5..070e5dd03e26 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -793,7 +793,10 @@ out_close: svc_xprt_put(xprt); } out_err: - nfsd_destroy(net); + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + nn->nfsd_serv->sv_nrthreads--; + else + nfsd_destroy(net); return err; } @@ -1545,7 +1548,7 @@ static int __init init_nfsd(void) goto out_free_all; return 0; out_free_all: - unregister_pernet_subsys(&nfsd_net_ops); + unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 640ac8fe891e..1d0583cfd970 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) goto out; ret = -ERANGE; - if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev)) goto out; segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f6b2d280aab5..3134c0e42fd4 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) int ret; ret = -ERANGE; - devsize = i_size_read(sb->s_bdev->bd_inode); + devsize = bdev_nr_bytes(sb->s_bdev); if (newsize > devsize) goto out; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index c8bfc01da5d7..1bfcb5d3ea48 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + u64 sb2off = 
NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); int valid[2], swp = 0; sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ab4f3362466d..373dbb627657 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -5,6 +5,7 @@ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. */ +#include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 0d7e948cb29c..5ae8de09b271 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_debug("Set device block size to %i bytes (block size bits %i).", blocksize, sb->s_blocksize_bits); /* Determine the size of the device in units of block_size bytes. */ - if (!i_size_read(sb->s_bdev->bd_inode)) { + vol->nr_blocks = sb_bdev_nr_blocks(sb); + if (!vol->nr_blocks) { if (!silent) ntfs_error(sb, "Unable to determine device size."); goto err_out_now; } - vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; /* Read the boot sector and return unlocked buffer head to it. */ if (!(bh = read_ntfs_boot_sector(sb, silent))) { if (!silent) @@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; } BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + vol->nr_blocks = sb_bdev_nr_blocks(sb); ntfs_debug("Changed device block size to %i bytes (block size " "bits %i) to match volume sector size.", blocksize, sb->s_blocksize_bits); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 34c4cbf7e29b..e8c00dda42ad 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -6,13 +6,9 @@ * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/hash.h> -#include <linux/nls.h> -#include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -291,7 +287,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, if (!rsize) { /* Empty resident -> Non empty nonresident. */ } else if (!is_data) { - err = ntfs_sb_write_run(sbi, run, 0, data, rsize); + err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); if (err) goto out2; } else if (!page) { @@ -451,11 +447,8 @@ again: again_1: align = sbi->cluster_size; - if (is_ext) { + if (is_ext) align <<= attr_b->nres.c_unit; - if (is_attr_sparsed(attr_b)) - keep_prealloc = false; - } old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); @@ -465,9 +458,6 @@ again_1: new_alloc = (new_size + align - 1) & ~(u64)(align - 1); new_alen = new_alloc >> cluster_bits; - if (keep_prealloc && is_ext) - keep_prealloc = false; - if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = true; @@ -529,7 +519,7 @@ add_alloc_in_same_attr_seg: } else if (pre_alloc == -1) { pre_alloc = 0; if (type == ATTR_DATA && !name_len && - sbi->options.prealloc) { + sbi->options->prealloc) { CLST new_alen2 = bytes_to_cluster( sbi, get_pre_allocated(new_size)); pre_alloc = new_alen2 - new_alen; @@ -1966,7 +1956,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) return 0; from = vbo; - to = (vbo + bytes) < data_size ? 
(vbo + bytes) : data_size; + to = min_t(u64, vbo + bytes, data_size); memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); return 0; } diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index fa32399eb517..bad6d8a849a2 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -336,7 +333,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, if (attr && attr->non_res) { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, 0); if (err) return err; al->dirty = false; @@ -423,7 +420,7 @@ next: return true; } -int al_update(struct ntfs_inode *ni) +int al_update(struct ntfs_inode *ni, int sync) { int err; struct ATTRIB *attr; @@ -445,7 +442,7 @@ int al_update(struct ntfs_inode *ni) memcpy(resident_data(attr), al->le, al->size); } else { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, sync); if (err) goto out; diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c index ce304d40b5e1..50d838093790 100644 --- a/fs/ntfs3/bitfunc.c +++ b/fs/ntfs3/bitfunc.c @@ -5,13 +5,8 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/nls.h> +#include <linux/types.h> -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" #define BITS_IN_SIZE_T (sizeof(size_t) * 8) @@ -124,8 +119,7 @@ bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) pos = nbits & 7; if (pos) { - u8 mask = fill_mask[pos]; - + mask = fill_mask[pos]; if ((*map & mask) != mask) return false; } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 831501555009..aa184407520f 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -10,12 +10,10 @@ * */ -#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> -#include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" @@ -435,7 +433,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) ; } else { n3 = rb_next(&e->count.node); - max_new_len = len > new_len ? len : new_len; + max_new_len = max(len, new_len); if (!n3) { wnd->extent_max = max_new_len; } else { @@ -731,7 +729,7 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -784,7 +782,7 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -834,7 +832,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wbits != wnd->free_bits[iw]) { bool ret; @@ -926,7 +924,7 @@ use_wnd: wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? 
tail : bits; + op = min_t(u32, tail, bits); if (wnd->free_bits[iw]) { bool ret; diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h index 31120569a87b..53ef7489c75f 100644 --- a/fs/ntfs3/debug.h +++ b/fs/ntfs3/debug.h @@ -11,6 +11,9 @@ #ifndef _LINUX_NTFS3_DEBUG_H #define _LINUX_NTFS3_DEBUG_H +struct super_block; +struct inode; + #ifndef Add2Ptr #define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I))) #define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B))) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 93f6d485564e..fb438d604040 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -7,10 +7,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> #include <linux/nls.h> #include "debug.h" @@ -18,30 +15,27 @@ #include "ntfs_fs.h" /* Convert little endian UTF-16 to NLS string. */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len) { - int ret, uni_len, warn; - const __le16 *ip; + int ret, warn; u8 *op; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; static_assert(sizeof(wchar_t) == sizeof(__le16)); if (!nls) { /* UTF-16 -> UTF-8 */ - ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len, - UTF16_LITTLE_ENDIAN, buf, buf_len); + ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf, + buf_len); buf[ret] = '\0'; return ret; } - ip = uni->name; op = buf; - uni_len = uni->len; warn = 0; - while (uni_len--) { + while (len--) { u16 ec; int charlen; char dump[5]; @@ -52,7 +46,7 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, break; } - ec = le16_to_cpu(*ip++); + ec = le16_to_cpu(*name++); charlen = nls->uni2char(ec, op, buf_len); if (charlen > 0) { @@ -186,7 +180,7 @@ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, { int ret, slen; const u8 *end; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; u16 *uname = uni->name; static_assert(sizeof(wchar_t) == sizeof(u16)); @@ -301,14 +295,14 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; /* Skip meta files. Unless option to show metafiles is set. */ - if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino)) + if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino)) return 0; - if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) + if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) return 0; - name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len, - name, PATH_MAX); + name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, + PATH_MAX); if (name_len <= 0) { ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", ino); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 424450e77ad5..a3cd3c3f091e 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -8,11 +8,11 @@ */ #include <linux/backing-dev.h> +#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/compat.h> #include <linux/falloc.h> #include <linux/fiemap.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -588,8 +588,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) truncate_pagecache(inode, vbo_down); if (!is_sparsed(ni) && !is_compressed(ni)) { - /* Normal file. */ - err = ntfs_zero_range(inode, vbo, end); + /* + * Normal file, can't make hole. + * TODO: Try to find way to save info about hole. 
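/*
 * The ntfs_utf16_to_nls() rework above passes a raw UTF-16LE name plus its
 * length instead of a struct le_str.  A minimal sketch of the utf8 (no NLS
 * table) path, mirroring the utf16s_to_utf8s() call in the hunk above; the
 * helper name is made up for illustration and the output buffer is assumed
 * to hold at least buf_len bytes.
 */
#include <linux/nls.h>

static int demo_utf16le_to_utf8(const __le16 *name, u32 len, u8 *buf,
				int buf_len)
{
	int ret;

	/* Convert 'len' UTF-16LE code units into UTF-8, then NUL-terminate. */
	ret = utf16s_to_utf8s((const wchar_t *)name, len, UTF16_LITTLE_ENDIAN,
			      buf, buf_len - 1);
	buf[ret] = '\0';
	return ret;	/* number of UTF-8 bytes written */
}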
+ */ + err = -EOPNOTSUPP; goto out; } @@ -737,7 +740,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; int err; - if (sbi->options.no_acs_rules) { + if (sbi->options->noacsrules) { /* "No access rules" - Force any changes of time etc. */ attr->ia_valid |= ATTR_FORCE; /* and disable for editing some attributes. */ @@ -1185,7 +1188,7 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) && + if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && atomic_read(&inode->i_writecount) == 1)) { ni_lock(ni); down_write(&ni->file.run_lock); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 938b12d56ca6..6f47a9c17f89 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -5,11 +5,8 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fiemap.h> #include <linux/fs.h> -#include <linux/nls.h> #include <linux/vmalloc.h> #include "debug.h" @@ -708,18 +705,35 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) continue; mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) { + /* Should never happened, 'cause already checked. */ + goto bad; + } attr = mi_find_attr(mi, NULL, le->type, le_name(le), le->name_len, &le->id); + if (!attr) { + /* Should never happened, 'cause already checked. */ + goto bad; + } asize = le32_to_cpu(attr->size); /* Insert into primary record. */ attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); - id = attr_ins->id; + if (!attr_ins) { + /* + * Internal error. + * Either no space in primary record (already checked). + * Either tried to insert another + * non indexed attribute (logic error). + */ + goto bad; + } /* Copy all except id. */ + id = attr_ins->id; memcpy(attr_ins, attr, asize); attr_ins->id = id; @@ -735,6 +749,10 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) ni->attr_list.dirty = false; return 0; +bad: + ntfs_inode_err(&ni->vfs_inode, "Internal error"); + make_bad_inode(&ni->vfs_inode); + return -EINVAL; } /* @@ -956,6 +974,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, continue; } + /* + * Do not try to insert this attribute + * if there is no room in record. + */ + if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size) + continue; + /* Try to insert attribute into this subrecord. */ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); @@ -1451,7 +1476,7 @@ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, attr->res.flags = RESIDENT_FLAG_INDEXED; /* is_attr_indexed(attr)) == true */ - le16_add_cpu(&ni->mi.mrec->hard_links, +1); + le16_add_cpu(&ni->mi.mrec->hard_links, 1); ni->mi.dirty = true; } attr->res.res = 0; @@ -1606,7 +1631,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, *le = NULL; - if (FILE_NAME_POSIX == name_type) + if (name_type == FILE_NAME_POSIX) return NULL; /* Enumerate all names. */ @@ -1706,18 +1731,16 @@ out: /* * ni_parse_reparse * - * Buffer is at least 24 bytes. 
+ * buffer - memory for reparse buffer header */ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer) + struct REPARSE_DATA_BUFFER *buffer) { const struct REPARSE_DATA_BUFFER *rp = NULL; u8 bits; u16 len; typeof(rp->CompressReparseBuffer) *cmpr; - static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24); - /* Try to estimate reparse point. */ if (!attr->non_res) { rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); @@ -1803,6 +1826,9 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, return REPARSE_NONE; } + if (buffer != rp) + memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER)); + /* Looks like normal symlink. */ return REPARSE_LINK; } @@ -2906,9 +2932,8 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); - if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) { + if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) return false; - } } return true; @@ -3077,7 +3102,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, const struct EA_INFO *info; info = resident_data_ex(attr, sizeof(struct EA_INFO)); - dup->ea_size = info->size_pack; + /* If ATTR_EA_INFO exists 'info' can't be NULL. */ + if (info) + dup->ea_size = info->size_pack; } } @@ -3205,7 +3232,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) goto out; } - err = al_update(ni); + err = al_update(ni, sync); if (err) goto out; } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index b5853aed0e25..06492f088d60 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -6,12 +6,8 @@ */ #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/hash.h> -#include <linux/nls.h> #include <linux/random.h> -#include <linux/ratelimit.h> #include <linux/slab.h> #include "debug.h" @@ -2219,7 +2215,7 @@ file_is_valid: err = ntfs_sb_write_run(log->ni->mi.sbi, &log->ni->file.run, off, page, - log->page_size); + log->page_size, 0); if (err) goto out; @@ -3710,7 +3706,7 @@ move_data: if (a_dirty) { attr = oa->attr; - err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes); + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); if (err) goto out; } @@ -5152,10 +5148,10 @@ end_reply: ntfs_fix_pre_write(&rh->rhdr, log->page_size); - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); if (!err) err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, - rh, log->page_size); + rh, log->page_size, 0); kfree(rh); if (err) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 91e3743e1442..4de9acb16968 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -8,7 +8,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -358,7 +358,7 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, enum ALLOCATE_OPT opt) { int err; - CLST alen = 0; + CLST alen; struct super_block *sb = sbi->sb; size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; @@ -370,27 +370,28 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, if (!zlen) { err = ntfs_refresh_zone(sbi); if (err) - goto out; + goto up_write; + zlen = wnd_zone_len(wnd); } if 
(!zlen) { ntfs_err(sbi->sb, "no free space to extend mft"); - goto out; + err = -ENOSPC; + goto up_write; } lcn = wnd_zone_bit(wnd); - alen = zlen > len ? len : zlen; + alen = min_t(CLST, len, zlen); wnd_zone_set(wnd, lcn + alen, zlen - alen); err = wnd_set_used(wnd, lcn, alen); - if (err) { - up_write(&wnd->rw_lock); - return err; - } + if (err) + goto up_write; + alcn = lcn; - goto out; + goto space_found; } /* * 'Cause cluster 0 is always used this value means that we should use @@ -404,49 +405,45 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); if (alen) - goto out; + goto space_found; /* Try to use clusters from MftZone. */ zlen = wnd_zone_len(wnd); zeroes = wnd_zeroes(wnd); /* Check too big request */ - if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) - goto out; + if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) { + err = -ENOSPC; + goto up_write; + } /* How many clusters to cat from zone. */ zlcn = wnd_zone_bit(wnd); zlen2 = zlen >> 1; - ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2); - new_zlen = zlen - ztrim; - - if (new_zlen < NTFS_MIN_MFT_ZONE) { - new_zlen = NTFS_MIN_MFT_ZONE; - if (new_zlen > zlen) - new_zlen = zlen; - } + ztrim = clamp_val(len, zlen2, zlen); + new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE); wnd_zone_set(wnd, zlcn, new_zlen); /* Allocate continues clusters. */ alen = wnd_find(wnd, len, 0, BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); - -out: - if (alen) { - err = 0; - *new_len = alen; - *new_lcn = alcn; - - ntfs_unmap_meta(sb, alcn, alen); - - /* Set hint for next requests. */ - if (!(opt & ALLOCATE_MFT)) - sbi->used.next_free_lcn = alcn + alen; - } else { + if (!alen) { err = -ENOSPC; + goto up_write; } +space_found: + err = 0; + *new_len = alen; + *new_lcn = alcn; + + ntfs_unmap_meta(sb, alcn, alen); + + /* Set hint for next requests. */ + if (!(opt & ALLOCATE_MFT)) + sbi->used.next_free_lcn = alcn + alen; +up_write: up_write(&wnd->rw_lock); return err; } @@ -1080,7 +1077,7 @@ int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, } int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes) + u64 vbo, const void *buf, size_t bytes, int sync) { struct super_block *sb = sbi->sb; u8 cluster_bits = sbi->cluster_bits; @@ -1099,8 +1096,8 @@ int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, len = ((u64)clen << cluster_bits) - off; for (;;) { - u32 op = len < bytes ? len : bytes; - int err = ntfs_sb_write(sb, lbo, op, buf, 0); + u32 op = min_t(u64, len, bytes); + int err = ntfs_sb_write(sb, lbo, op, buf, sync); if (err) return err; @@ -1300,7 +1297,7 @@ int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, nb->off = off = lbo & (blocksize - 1); for (;;) { - u32 len32 = len < bytes ? len : bytes; + u32 len32 = min_t(u64, len, bytes); sector_t block = lbo >> sb->s_blocksize_bits; do { @@ -2175,7 +2172,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write main SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, - d_security, aligned_sec_size); + d_security, aligned_sec_size, 0); if (err) goto out; @@ -2193,7 +2190,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write copy SDS bucket. 
*/ err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, - aligned_sec_size); + aligned_sec_size, 0); if (err) goto out; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 0daca9adc54c..6f81e3a49abf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -8,7 +8,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -671,138 +671,74 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, const struct INDEX_HDR *hdr, const void *key, size_t key_len, const void *ctx, int *diff) { - struct NTFS_DE *e; + struct NTFS_DE *e, *found = NULL; NTFS_CMP_FUNC cmp = indx->cmp; + int min_idx = 0, mid_idx, max_idx = 0; + int diff2; + int table_size = 8; u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); + u16 offs[128]; -#ifdef NTFS3_INDEX_BINARY_SEARCH - int max_idx = 0, fnd, min_idx; - int nslots = 64; - u16 *offs; - - if (end > 0x10000) - goto next; - - offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS); - if (!offs) - goto next; +fill_table: + if (off + sizeof(struct NTFS_DE) > end) + return NULL; - /* Use binary search algorithm. */ -next1: - if (off + sizeof(struct NTFS_DE) > end) { - e = NULL; - goto out1; - } e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) { - e = NULL; - goto out1; - } - - if (max_idx >= nslots) { - u16 *ptr; - int new_slots = ALIGN(2 * nslots, 8); - - ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS); - if (ptr) - memcpy(ptr, offs, sizeof(u16) * max_idx); - kfree(offs); - offs = ptr; - nslots = new_slots; - if (!ptr) - goto next; - } - - /* Store entry table. */ - offs[max_idx] = off; + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return NULL; if (!de_is_last(e)) { + offs[max_idx] = off; off += e_size; - max_idx += 1; - goto next1; - } - /* - * Table of pointers is created. - * Use binary search to find entry that is <= to the search value. - */ - fnd = -1; - min_idx = 0; + max_idx++; + if (max_idx < table_size) + goto fill_table; - while (min_idx <= max_idx) { - int mid_idx = min_idx + ((max_idx - min_idx) >> 1); - int diff2; - - e = Add2Ptr(hdr, offs[mid_idx]); + max_idx--; + } - e_key_len = le16_to_cpu(e->key_size); +binary_search: + e_key_len = le16_to_cpu(e->key_size); - diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + if (diff2 > 0) { + if (found) { + min_idx = mid_idx + 1; + } else { + if (de_is_last(e)) + return NULL; - if (!diff2) { - *diff = 0; - goto out1; + max_idx = 0; + table_size = min(table_size * 2, + (int)ARRAY_SIZE(offs)); + goto fill_table; } - - if (diff2 < 0) { + } else if (diff2 < 0) { + if (found) max_idx = mid_idx - 1; - fnd = mid_idx; - if (!fnd) - break; - } else { - min_idx = mid_idx + 1; - } - } + else + max_idx--; - if (fnd == -1) { - e = NULL; - goto out1; + found = e; + } else { + *diff = 0; + return e; } - *diff = -1; - e = Add2Ptr(hdr, offs[fnd]); - -out1: - kfree(offs); - - return e; -#endif - -next: - /* - * Entries index are sorted. - * Enumerate all entries until we find entry - * that is <= to the search value. 
- */ - if (off + sizeof(struct NTFS_DE) > end) - return NULL; - - e = Add2Ptr(hdr, off); - e_size = le16_to_cpu(e->size); - - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) - return NULL; - - off += e_size; - - e_key_len = le16_to_cpu(e->key_size); - - *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx); - if (!*diff) - return e; + if (min_idx > max_idx) { + *diff = -1; + return found; + } - if (*diff <= 0) - return e; + mid_idx = (min_idx + max_idx) >> 1; + e = Add2Ptr(hdr, offs[mid_idx]); - if (de_is_last(e)) { - *diff = 1; - return e; - } - goto next; + goto binary_search; } /* @@ -1136,9 +1072,7 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, if (!e) return -EINVAL; - if (fnd) - fnd->root_de = e; - + fnd->root_de = e; err = 0; for (;;) { @@ -1401,7 +1335,7 @@ ok: static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { - int err = -ENOMEM; + int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *bitmap; struct ATTRIB *alloc; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index db2a5a4c38e4..a87ab3ad3cd3 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -5,10 +5,8 @@ * */ -#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/nls.h> @@ -49,8 +47,8 @@ static struct inode *ntfs_read_mft(struct inode *inode, inode->i_op = NULL; /* Setup 'uid' and 'gid' */ - inode->i_uid = sbi->options.fs_uid; - inode->i_gid = sbi->options.fs_gid; + inode->i_uid = sbi->options->fs_uid; + inode->i_gid = sbi->options->fs_gid; err = mi_init(&ni->mi, sbi, ino); if (err) @@ -224,12 +222,9 @@ next_attr: if (!attr->non_res) { ni->i_valid = inode->i_size = rsize; inode_set_bytes(inode, rsize); - t32 = asize; - } else { - t32 = le16_to_cpu(attr->nres.run_off); } - mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv); + mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv); if (!attr->non_res) { ni->ni_flags |= NI_FLAG_RESIDENT; @@ -272,7 +267,7 @@ next_attr: goto out; mode = sb->s_root - ? (S_IFDIR | (0777 & sbi->options.fs_dmask_inv)) + ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : (S_IFDIR | 0777); goto next_attr; @@ -315,17 +310,14 @@ next_attr: rp_fa = ni_parse_reparse(ni, attr, &rp); switch (rp_fa) { case REPARSE_LINK: - if (!attr->non_res) { - inode->i_size = rsize; - inode_set_bytes(inode, rsize); - t32 = asize; - } else { - inode->i_size = - le64_to_cpu(attr->nres.data_size); - t32 = le16_to_cpu(attr->nres.run_off); - } + /* + * Normal symlink. + * Assume one unicode symbol == one utf8. + */ + inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer + .PrintNameLength) / + sizeof(u16); - /* Looks like normal symlink. */ ni->i_valid = inode->i_size; /* Clear directory bit. */ @@ -422,7 +414,7 @@ end_enum: ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; - inode_nohighmem(inode); // ?? 
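/*
 * The hdr_find_e() rewrite above replaces the optional linear scan with an
 * always-on two-pass scheme: collect entry offsets into a small on-stack
 * table, then binary search that table.  A simplified, self-contained sketch
 * of the idea (fixed-size table, integer keys, no callback comparator, no
 * "last entry" sentinel -- all of which the real function handles):
 */
#include <stddef.h>
#include <stdint.h>

struct demo_entry {
	uint16_t size;		/* total size of this variable-length entry */
	uint16_t key;		/* search key, made up for the demo */
	/* variable-length payload follows */
};

/* Return the first entry whose key is >= 'key', or NULL if none exists. */
static const struct demo_entry *demo_find_ge(const uint8_t *hdr, uint32_t used,
					     uint16_t key)
{
	uint32_t offs[128];
	uint32_t off = 0;
	int n = 0, lo, hi, best = -1;

	/* Pass 1: entries are variable-length, so index them by offset. */
	while (off + sizeof(struct demo_entry) <= used && n < 128) {
		const struct demo_entry *e = (const void *)(hdr + off);

		if (e->size < sizeof(struct demo_entry) || off + e->size > used)
			return NULL;		/* malformed entry */
		offs[n++] = off;
		off += e->size;
	}

	/* Pass 2: ordinary binary search over the offset table. */
	for (lo = 0, hi = n - 1; lo <= hi; ) {
		int mid = lo + (hi - lo) / 2;
		const struct demo_entry *e = (const void *)(hdr + offs[mid]);

		if (e->key >= key) {
			best = mid;
			hi = mid - 1;
		} else {
			lo = mid + 1;
		}
	}

	return best < 0 ? NULL : (const void *)(hdr + offs[best]);
}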
+ inode_nohighmem(inode); } else if (S_ISREG(mode)) { ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; @@ -443,7 +435,7 @@ end_enum: goto out; } - if ((sbi->options.sys_immutable && + if ((sbi->options->sys_immutable && (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { inode->i_flags |= S_IMMUTABLE; @@ -1054,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, if (!ret && i2) ret = writeback_inode(i2); if (!ret) - ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping); + ret = sync_blockdev_nowait(sb->s_bdev); return ret; } @@ -1200,9 +1192,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, struct REPARSE_DATA_BUFFER *rp = NULL; bool rp_inserted = false; + ni_lock_dir(dir_ni); + dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); - if (!dir_root) - return ERR_PTR(-EINVAL); + if (!dir_root) { + err = -EINVAL; + goto out1; + } if (S_ISDIR(mode)) { /* Use parent's directory attributes. */ @@ -1244,7 +1240,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, * } */ } else if (S_ISREG(mode)) { - if (sbi->options.sparse) { + if (sbi->options->sparse) { /* Sparsed regular file, cause option 'sparse'. */ fa = FILE_ATTRIBUTE_SPARSE_FILE | FILE_ATTRIBUTE_ARCHIVE; @@ -1486,7 +1482,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, asize = ALIGN(SIZEOF_RESIDENT + nsize, 8); t16 = PtrOffset(rec, attr); - /* 0x78 - the size of EA + EAINFO to store WSL */ + /* + * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes. + * It is good idea to keep extened attributes resident. + */ if (asize + t16 + 0x78 + 8 > sbi->record_size) { CLST alen; CLST clst = bytes_to_cluster(sbi, nsize); @@ -1521,14 +1520,14 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); - inode->i_size = nsize; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); - inode->i_size = nsize; nsize = 0; } + /* Size of symlink equals the length of input string. */ + inode->i_size = size; attr->size = cpu_to_le32(asize); @@ -1551,6 +1550,9 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (err) goto out6; + /* Unlock parent directory before ntfs_init_acl. */ + ni_unlock(dir_ni); + inode->i_generation = le16_to_cpu(rec->seq); dir->i_mtime = dir->i_ctime = inode->i_atime; @@ -1562,6 +1564,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; inode->i_mapping->a_ops = &ntfs_aops; + inode->i_size = size; + inode_nohighmem(inode); } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; @@ -1577,7 +1581,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(mnt_userns, inode, dir); if (err) - goto out6; + goto out7; } else #endif { @@ -1586,7 +1590,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, /* Write non resident data. */ if (nsize) { - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); if (err) goto out7; } @@ -1607,8 +1611,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, out7: /* Undo 'indx_insert_entry'. 
*/ + ni_lock_dir(dir_ni); indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, le16_to_cpu(new_de->key_size), sbi); + /* ni_unlock(dir_ni); will be called later. */ out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); @@ -1632,8 +1638,10 @@ out2: kfree(rp); out1: - if (err) + if (err) { + ni_unlock(dir_ni); return ERR_PTR(err); + } unlock_new_inode(inode); @@ -1754,15 +1762,15 @@ void ntfs_evict_inode(struct inode *inode) static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, int buflen) { - int i, err = 0; + int i, err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - u64 i_size = inode->i_size; - u16 nlen = 0; + u64 size; + u16 ulen = 0; void *to_free = NULL; struct REPARSE_DATA_BUFFER *rp; - struct le_str *uni; + const __le16 *uname; struct ATTRIB *attr; /* Reparse data present. Try to parse it. */ @@ -1771,68 +1779,64 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, *buffer = 0; - /* Read into temporal buffer. */ - if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) { - err = -EINVAL; - goto out; - } - attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); - if (!attr) { - err = -EINVAL; + if (!attr) goto out; - } if (!attr->non_res) { - rp = resident_data_ex(attr, i_size); - if (!rp) { - err = -EINVAL; + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + if (!rp) goto out; - } + size = le32_to_cpu(attr->res.data_size); } else { - rp = kmalloc(i_size, GFP_NOFS); + size = le64_to_cpu(attr->nres.data_size); + rp = NULL; + } + + if (size > sbi->reparse.max_size || size <= sizeof(u32)) + goto out; + + if (!rp) { + rp = kmalloc(size, GFP_NOFS); if (!rp) { err = -ENOMEM; goto out; } to_free = rp; - err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL); + /* Read into temporal buffer. */ + err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL); if (err) goto out; } - err = -EINVAL; - /* Microsoft Tag. */ switch (rp->ReparseTag) { case IO_REPARSE_TAG_MOUNT_POINT: /* Mount points and junctions. */ /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer) + - le16_to_cpu(rp->MountPointReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); + uname = Add2Ptr(rp, + offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer) + + le16_to_cpu(rp->MountPointReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); break; case IO_REPARSE_TAG_SYMLINK: /* FolderSymbolicLink */ /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? 
*/ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer) + - le16_to_cpu(rp->SymbolicLinkReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu( + uname = Add2Ptr( + rp, offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer) + + le16_to_cpu(rp->SymbolicLinkReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu( rp->SymbolicLinkReparseBuffer.PrintNameLength); break; @@ -1864,29 +1868,28 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, goto out; } if (!IsReparseTagNameSurrogate(rp->ReparseTag) || - i_size <= sizeof(struct REPARSE_POINT)) { + size <= sizeof(struct REPARSE_POINT)) { goto out; } /* Users tag. */ - uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2); - nlen = le16_to_cpu(rp->ReparseDataLength) - + uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT)); + ulen = le16_to_cpu(rp->ReparseDataLength) - sizeof(struct REPARSE_POINT); } /* Convert nlen from bytes to UNICODE chars. */ - nlen >>= 1; + ulen >>= 1; /* Check that name is available. */ - if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size)) + if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size)) goto out; /* If name is already zero terminated then truncate it now. */ - if (!uni->name[nlen - 1]) - nlen -= 1; - uni->len = nlen; + if (!uname[ulen - 1]) + ulen -= 1; - err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen); + err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen); if (err < 0) goto out; diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h index 2d70ae42f1b5..dd7ced000d0e 100644 --- a/fs/ntfs3/lib/decompress_common.h +++ b/fs/ntfs3/lib/decompress_common.h @@ -5,6 +5,9 @@ * Copyright (C) 2015 Eric Biggers */ +#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H +#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H + #include <linux/string.h> #include <linux/compiler.h> #include <linux/types.h> @@ -336,3 +339,5 @@ static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend return dst; } + +#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */ diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h index f508fbad2e71..90309a5ae59c 100644 --- a/fs/ntfs3/lib/lib.h +++ b/fs/ntfs3/lib/lib.h @@ -7,6 +7,10 @@ * - linux kernel code style */ +#ifndef _LINUX_NTFS3_LIB_LIB_H +#define _LINUX_NTFS3_LIB_LIB_H + +#include <linux/types.h> /* globals from xpress_decompress.c */ struct xpress_decompressor *xpress_allocate_decompressor(void); @@ -24,3 +28,5 @@ int lzx_decompress(struct lzx_decompressor *__restrict d, const void *__restrict compressed_data, size_t compressed_size, void *__restrict uncompressed_data, size_t uncompressed_size); + +#endif /* _LINUX_NTFS3_LIB_LIB_H */ diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index f1f691a67cc4..28f654561f27 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -5,13 +5,13 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> #include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" // clang-format off @@ -292,7 +292,7 @@ next: /* * get_lznt_ctx * @level: 0 - Standard compression. - * !0 - Best compression, requires a lot of cpu. 
+ * !0 - Best compression, requires a lot of cpu. */ struct lznt *get_lznt_ctx(int level) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index e58415d07132..bc741213ad84 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -5,11 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> -#include <linux/namei.h> #include <linux/nls.h> #include "debug.h" @@ -99,16 +95,11 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -120,16 +111,11 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -200,15 +186,10 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, { u32 size = strlen(symname); struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -219,15 +200,10 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 6bb3e595263b..9cc396b117bf 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -10,19 +10,27 @@ #ifndef _LINUX_NTFS3_NTFS_H #define _LINUX_NTFS3_NTFS_H -/* TODO: Check 4K MFT record and 512 bytes cluster. */ +#include <linux/blkdev.h> +#include <linux/build_bug.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "debug.h" -/* Activate this define to use binary search in indexes. */ -#define NTFS3_INDEX_BINARY_SEARCH +/* TODO: Check 4K MFT record and 512 bytes cluster. */ /* Check each run for marked clusters. */ #define NTFS3_CHECK_FREE_CLST #define NTFS_NAME_LEN 255 -/* ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. */ -#define NTFS_LINK_MAX 0x400 -//#define NTFS_LINK_MAX 0xffff +/* + * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. + * xfstest generic/041 creates 3003 hardlinks. + */ +#define NTFS_LINK_MAX 4000 /* * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys. 
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index dc71c59fd445..8aaec7e0804e 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -9,6 +9,37 @@ #ifndef _LINUX_NTFS3_NTFS_FS_H #define _LINUX_NTFS3_NTFS_FS_H +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/cleancache.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/page-flags.h> +#include <linux/pagemap.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/time64.h> +#include <linux/types.h> +#include <linux/uidgid.h> +#include <asm/div64.h> +#include <asm/page.h> + +#include "debug.h" +#include "ntfs.h" + +struct dentry; +struct fiemap_extent_info; +struct user_namespace; +struct page; +struct writeback_control; +enum utf16_endian; + + #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 @@ -52,6 +83,7 @@ // clang-format on struct ntfs_mount_options { + char *nls_name; struct nls_table *nls; kuid_t fs_uid; @@ -59,19 +91,16 @@ struct ntfs_mount_options { u16 fs_fmask_inv; u16 fs_dmask_inv; - unsigned uid : 1, /* uid was set. */ - gid : 1, /* gid was set. */ - fmask : 1, /* fmask was set. */ - dmask : 1, /* dmask was set. */ - sys_immutable : 1, /* Immutable system files. */ - discard : 1, /* Issue discard requests on deletions. */ - sparse : 1, /* Create sparse files. */ - showmeta : 1, /* Show meta files. */ - nohidden : 1, /* Do not show hidden files. */ - force : 1, /* Rw mount dirty volume. */ - no_acs_rules : 1, /*Exclude acs rules. */ - prealloc : 1 /* Preallocate space when file is growing. */ - ; + unsigned fmask : 1; /* fmask was set. */ + unsigned dmask : 1; /*dmask was set. */ + unsigned sys_immutable : 1; /* Immutable system files. */ + unsigned discard : 1; /* Issue discard requests on deletions. */ + unsigned sparse : 1; /* Create sparse files. */ + unsigned showmeta : 1; /* Show meta files. */ + unsigned nohidden : 1; /* Do not show hidden files. */ + unsigned force : 1; /* RW mount dirty volume. */ + unsigned noacsrules : 1; /* Exclude acs rules. */ + unsigned prealloc : 1; /* Preallocate space when file is growing. */ }; /* Special value to unpack and deallocate. 
*/ @@ -182,10 +211,8 @@ struct ntfs_sb_info { u32 blocks_per_cluster; // cluster_size / sb->s_blocksize u32 record_size; - u32 sector_size; u32 index_size; - u8 sector_bits; u8 cluster_bits; u8 record_bits; @@ -279,7 +306,7 @@ struct ntfs_sb_info { #endif } compress; - struct ntfs_mount_options options; + struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; }; @@ -436,7 +463,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, const __le16 *name, size_t name_len, const struct MFT_REF *ref); -int al_update(struct ntfs_inode *ni); +int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { return (size + 1023) & ~(size_t)1023; @@ -448,7 +475,7 @@ bool are_bits_set(const ulong *map, size_t bit, size_t nbits); size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); /* Globals from dir.c */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len); int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, @@ -520,7 +547,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct ATTR_LIST_ENTRY **entry); int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer); + struct REPARSE_DATA_BUFFER *buffer); int ni_write_inode(struct inode *inode, int sync, const char *hint); #define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, @@ -577,7 +604,7 @@ int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes); + u64 vbo, const void *buf, size_t bytes, int sync); struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo); int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 103705c86772..861e35791506 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 26ed2b64345e..a8fec651f973 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -7,10 +7,8 @@ */ #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/log2.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 55bbc9200a10..29813200c7af 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -23,16 +23,15 @@ * */ -#include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> -#include <linux/iversion.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/log2.h> #include <linux/module.h> #include <linux/nls.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/statfs.h> @@ -205,9 +204,11 @@ void *ntfs_put_shared(void *ptr) return 
ret; } -static inline void clear_mount_options(struct ntfs_mount_options *options) +static inline void put_mount_options(struct ntfs_mount_options *options) { + kfree(options->nls_name); unload_nls(options->nls); + kfree(options); } enum Opt { @@ -223,218 +224,175 @@ enum Opt { Opt_nohidden, Opt_showmeta, Opt_acl, - Opt_noatime, - Opt_nls, + Opt_iocharset, Opt_prealloc, - Opt_no_acs_rules, + Opt_noacsrules, Opt_err, }; -static const match_table_t ntfs_tokens = { - { Opt_uid, "uid=%u" }, - { Opt_gid, "gid=%u" }, - { Opt_umask, "umask=%o" }, - { Opt_dmask, "dmask=%o" }, - { Opt_fmask, "fmask=%o" }, - { Opt_immutable, "sys_immutable" }, - { Opt_discard, "discard" }, - { Opt_force, "force" }, - { Opt_sparse, "sparse" }, - { Opt_nohidden, "nohidden" }, - { Opt_acl, "acl" }, - { Opt_noatime, "noatime" }, - { Opt_showmeta, "showmeta" }, - { Opt_nls, "nls=%s" }, - { Opt_prealloc, "prealloc" }, - { Opt_no_acs_rules, "no_acs_rules" }, - { Opt_err, NULL }, +static const struct fs_parameter_spec ntfs_fs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_flag_no("sys_immutable", Opt_immutable), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag_no("force", Opt_force), + fsparam_flag_no("sparse", Opt_sparse), + fsparam_flag_no("hidden", Opt_nohidden), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("prealloc", Opt_prealloc), + fsparam_flag_no("acsrules", Opt_noacsrules), + fsparam_string("iocharset", Opt_iocharset), + {} }; -static noinline int ntfs_parse_options(struct super_block *sb, char *options, - int silent, - struct ntfs_mount_options *opts) +/* + * Load nls table or if @nls is utf8 then return NULL. 
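For context, the ntfs_fs_parameters table above relies on fsparam_flag_no(), which accepts both the plain and the "no"-prefixed spelling of a flag and reports which one was given through result.negated (ntfs3 uses this for options such as "hidden"/"nohidden" and "acsrules"/"noacsrules"). A minimal stand-alone sketch of how such a handler typically consumes the parse result — hypothetical "example"/Opt_example names, not taken from this series — could look like this:

/* Illustrative sketch only; names are hypothetical. */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_example };

struct example_options {
	bool example;
};

static const struct fs_parameter_spec example_parameters[] = {
	/* fsparam_flag_no() accepts both "example" and "noexample". */
	fsparam_flag_no("example", Opt_example),
	{}
};

static int example_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	/* fs_private is assumed to have been allocated in ->init_fs_context(). */
	struct example_options *opts = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, example_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_example:
		/* result.negated is true when the user passed "noexample". */
		opts->example = !result.negated;
		break;
	}

	return 0;
}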
+ */ +static struct nls_table *ntfs_load_nls(char *nls) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char nls_name[30]; - struct nls_table *nls; + struct nls_table *ret; - opts->fs_uid = current_uid(); - opts->fs_gid = current_gid(); - opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask(); - nls_name[0] = 0; + if (!nls) + nls = CONFIG_NLS_DEFAULT; - if (!options) - goto out; + if (strcmp(nls, "utf8") == 0) + return NULL; - while ((p = strsep(&options, ","))) { - int token; + if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) + return load_nls_default(); - if (!*p) - continue; + ret = load_nls(nls); + if (ret) + return ret; - token = match_token(p, ntfs_tokens, args); - switch (token) { - case Opt_immutable: - opts->sys_immutable = 1; - break; - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(opts->fs_uid)) - return -EINVAL; - opts->uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(opts->fs_gid)) - return -EINVAL; - opts->gid = 1; - break; - case Opt_umask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = opts->fs_dmask_inv = ~option; - opts->fmask = opts->dmask = 1; - break; - case Opt_dmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_dmask_inv = ~option; - opts->dmask = 1; - break; - case Opt_fmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = ~option; - opts->fmask = 1; - break; - case Opt_discard: - opts->discard = 1; - break; - case Opt_force: - opts->force = 1; - break; - case Opt_sparse: - opts->sparse = 1; - break; - case Opt_nohidden: - opts->nohidden = 1; - break; - case Opt_acl: + return ERR_PTR(-EINVAL); +} + +static int ntfs_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ntfs_mount_options *opts = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, ntfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->fs_uid)) + return invalf(fc, "ntfs3: Invalid value for uid."); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->fs_gid)) + return invalf(fc, "ntfs3: Invalid value for gid."); + break; + case Opt_umask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for umask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fs_dmask_inv = ~result.uint_32; + opts->fmask = 1; + opts->dmask = 1; + break; + case Opt_dmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for dmask."); + opts->fs_dmask_inv = ~result.uint_32; + opts->dmask = 1; + break; + case Opt_fmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for fmask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fmask = 1; + break; + case Opt_immutable: + opts->sys_immutable = result.negated ? 0 : 1; + break; + case Opt_discard: + opts->discard = result.negated ? 0 : 1; + break; + case Opt_force: + opts->force = result.negated ? 0 : 1; + break; + case Opt_sparse: + opts->sparse = result.negated ? 0 : 1; + break; + case Opt_nohidden: + opts->nohidden = result.negated ? 
1 : 0; + break; + case Opt_acl: + if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - ntfs_err(sb, "support for ACL not compiled in!"); - return -EINVAL; + return invalf(fc, "ntfs3: Support for ACL not compiled in!"); #endif - case Opt_noatime: - sb->s_flags |= SB_NOATIME; - break; - case Opt_showmeta: - opts->showmeta = 1; - break; - case Opt_nls: - match_strlcpy(nls_name, &args[0], sizeof(nls_name)); - break; - case Opt_prealloc: - opts->prealloc = 1; - break; - case Opt_no_acs_rules: - opts->no_acs_rules = 1; - break; - default: - if (!silent) - ntfs_err( - sb, - "Unrecognized mount option \"%s\" or missing value", - p); - //return -EINVAL; - } - } - -out: - if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) { - /* - * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s() - * instead of NLS. - */ - nls = NULL; - } else if (nls_name[0]) { - nls = load_nls(nls_name); - if (!nls) { - ntfs_err(sb, "failed to load \"%s\"", nls_name); - return -EINVAL; - } - } else { - nls = load_nls_default(); - if (!nls) { - ntfs_err(sb, "failed to load default nls"); - return -EINVAL; - } + else + fc->sb_flags &= ~SB_POSIXACL; + break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; + case Opt_iocharset: + kfree(opts->nls_name); + opts->nls_name = param->string; + param->string = NULL; + break; + case Opt_prealloc: + opts->prealloc = result.negated ? 0 : 1; + break; + case Opt_noacsrules: + opts->noacsrules = result.negated ? 1 : 0; + break; + default: + /* Should not be here unless we forget add case. */ + return -EINVAL; } - opts->nls = nls; - return 0; } -static int ntfs_remount(struct super_block *sb, int *flags, char *data) +static int ntfs_fs_reconfigure(struct fs_context *fc) { - int err, ro_rw; + struct super_block *sb = fc->root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options old_opts; - char *orig_data = kstrdup(data, GFP_KERNEL); - - if (data && !orig_data) - return -ENOMEM; + struct ntfs_mount_options *new_opts = fc->fs_private; + int ro_rw; - /* Store original options. */ - memcpy(&old_opts, &sbi->options, sizeof(old_opts)); - clear_mount_options(&sbi->options); - memset(&sbi->options, 0, sizeof(sbi->options)); - - err = ntfs_parse_options(sb, data, 0, &sbi->options); - if (err) - goto restore_opts; - - ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY); + ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { - ntfs_warn( - sb, - "Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); - err = -EINVAL; - goto restore_opts; + errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. 
Please umount/remount instead\n"); + return -EINVAL; } + new_opts->nls = ntfs_load_nls(new_opts->nls_name); + if (IS_ERR(new_opts->nls)) { + new_opts->nls = NULL; + errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + return -EINVAL; + } + if (new_opts->nls != sbi->options->nls) + return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && - !sbi->options.force) { - ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); - err = -EINVAL; - goto restore_opts; + !new_opts->force) { + errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + return -EINVAL; } - clear_mount_options(&old_opts); - - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) | - SB_NODIRATIME | SB_NOATIME; - ntfs_info(sb, "re-mounted. Opts: %s", orig_data); - err = 0; - goto out; + memcpy(sbi->options, new_opts, sizeof(*new_opts)); -restore_opts: - clear_mount_options(&sbi->options); - memcpy(&sbi->options, &old_opts, sizeof(old_opts)); - -out: - kfree(orig_data); - return err; + return 0; } static struct kmem_cache *ntfs_inode_cachep; @@ -513,8 +471,6 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi) xpress_free_decompressor(sbi->compress.xpress); lzx_free_decompressor(sbi->compress.lzx); #endif - clear_mount_options(&sbi->options); - kfree(sbi); } @@ -525,7 +481,9 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); + put_mount_options(sbi->options); put_ntfs(sbi); + sb->s_fs_info = NULL; sync_blockdev(sb->s_bdev); } @@ -552,23 +510,21 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options *opts = &sbi->options; + struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); - if (opts->uid) - seq_printf(m, ",uid=%u", - from_kuid_munged(user_ns, opts->fs_uid)); - if (opts->gid) - seq_printf(m, ",gid=%u", - from_kgid_munged(user_ns, opts->fs_gid)); + seq_printf(m, ",uid=%u", + from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", + from_kgid_munged(user_ns, opts->fs_gid)); if (opts->fmask) seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv); if (opts->dmask) seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv); if (opts->nls) - seq_printf(m, ",nls=%s", opts->nls->charset); + seq_printf(m, ",iocharset=%s", opts->nls->charset); else - seq_puts(m, ",nls=utf8"); + seq_puts(m, ",iocharset=utf8"); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) @@ -581,14 +537,12 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nohidden"); if (opts->force) seq_puts(m, ",force"); - if (opts->no_acs_rules) - seq_puts(m, ",no_acs_rules"); + if (opts->noacsrules) + seq_puts(m, ",noacsrules"); if (opts->prealloc) seq_puts(m, ",prealloc"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); - if (sb->s_flags & SB_NOATIME) - seq_puts(m, ",noatime"); return 0; } @@ -643,7 +597,6 @@ static const struct super_operations ntfs_sops = { .statfs = ntfs_statfs, .show_options = ntfs_show_options, .sync_fs = ntfs_sync_fs, - .remount_fs = ntfs_remount, .write_inode = ntfs3_write_inode, }; @@ -729,7 +682,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 
sectors, clusters, fs_size, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -787,20 +740,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, goto out; } - sbi->sector_size = boot_sector_size; - sbi->sector_bits = blksize_bits(boot_sector_size); - fs_size = (sectors + 1) << sbi->sector_bits; + sbi->volume.size = sectors * boot_sector_size; - gb = format_size_gb(fs_size, &mb); + gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); /* * - Volume formatted and mounted with the same sector size. * - Volume formatted 4K and mounted as 512. * - Volume formatted 512 and mounted as 4K. */ - if (sbi->sector_size != sector_size) { - ntfs_warn(sb, - "Different NTFS' sector size and media sector size"); + if (boot_sector_size != sector_size) { + ntfs_warn( + sb, + "Different NTFS' sector size (%u) and media sector size (%u)", + boot_sector_size, sector_size); dev_size += sector_size - 1; } @@ -810,8 +763,19 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->mft.lbo = mlcn << sbi->cluster_bits; sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; - if (sbi->cluster_size < sbi->sector_size) + /* Compare boot's cluster and sector. */ + if (sbi->cluster_size < boot_sector_size) + goto out; + + /* Compare boot's cluster and media sector. */ + if (sbi->cluster_size < sector_size) { + /* No way to use ntfs_get_block in this case. */ + ntfs_err( + sb, + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + sbi->cluster_size, sector_size); goto out; + } sbi->cluster_mask = sbi->cluster_size - 1; sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; @@ -836,10 +800,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, : (u32)boot->index_size << sbi->cluster_bits; sbi->volume.ser_num = le64_to_cpu(boot->serial_num); - sbi->volume.size = sectors << sbi->sector_bits; /* Warning if RAW volume. */ - if (dev_size < fs_size) { + if (dev_size < sbi->volume.size + boot_sector_size) { u32 mb0, gb0; gb0 = format_size_gb(dev_size, &mb0); @@ -883,8 +846,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; - if (sbi->cluster_size < PAGE_SIZE) - sb_set_blocksize(sb, sbi->cluster_size); + sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; @@ -897,9 +859,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (clusters >= (1ull << (64 - sbi->cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; + sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; #endif err = 0; @@ -913,14 +877,13 @@ out: /* * ntfs_fill_super - Try to mount. 
*/ -static int ntfs_fill_super(struct super_block *sb, void *data, int silent) +static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { int err; - struct ntfs_sb_info *sbi; + struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct inode *bd_inode = bdev->bd_inode; - struct request_queue *rq = bdev_get_queue(bdev); - struct inode *inode = NULL; + struct request_queue *rq; + struct inode *inode; struct ntfs_inode *ni; size_t i, tt; CLST vcn, lcn, len; @@ -928,18 +891,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) const struct VOLUME_INFO *info; u32 idx, done, bytes; struct ATTR_DEF_ENTRY *t; - u16 *upcase = NULL; u16 *shared; - bool is_ro; struct MFT_REF ref; ref.high = 0; - sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; sbi->sb = sb; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" @@ -948,41 +904,27 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - err = ntfs_parse_options(sb, data, silent, &sbi->options); - if (err) + sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); + if (IS_ERR(sbi->options->nls)) { + sbi->options->nls = NULL; + errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + err = -EINVAL; goto out; + } - if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) { - ; - } else { + rq = bdev_get_queue(bdev); + if (blk_queue_discard(rq) && rq->limits.discard_granularity) { sbi->discard_granularity = rq->limits.discard_granularity; sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } - sb_set_blocksize(sb, PAGE_SIZE); - /* Parse boot. */ err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512, - bd_inode->i_size); + bdev_nr_bytes(bdev)); if (err) goto out; -#ifdef CONFIG_NTFS3_64BIT_CLUSTER - sb->s_maxbytes = MAX_LFS_FILESIZE; -#else - sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; -#endif - - mutex_init(&sbi->compress.mtx_lznt); -#ifdef CONFIG_NTFS3_LZX_XPRESS - mutex_init(&sbi->compress.mtx_xpress); - mutex_init(&sbi->compress.mtx_lzx); -#endif - /* * Load $Volume. This should be done before $LogFile * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'. @@ -991,9 +933,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Volume."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1015,36 +956,33 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) } else { /* Should we break mounting here? */ //err = -EINVAL; - //goto out; + //goto put_inode_out; } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); if (!attr || is_attr_ext(attr)) { err = -EINVAL; - goto out; + goto put_inode_out; } info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); if (!info) { err = -EINVAL; - goto out; + goto put_inode_out; } sbi->volume.major_ver = info->major_ver; sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; - sbi->volume.ni = ni; - inode = NULL; /* Load $MFTMirr to estimate recs_mirr. 
*/ ref.low = cpu_to_le32(MFT_REC_MIRR); ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFTMirr."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1058,9 +996,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1068,22 +1005,19 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = ntfs_loadlog_and_replay(ni, sbi); if (err) - goto out; + goto put_inode_out; iput(inode); - inode = NULL; - - is_ro = sb_rdonly(sbi->sb); if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { - if (!is_ro) { + if (!sb_rdonly(sb)) { ntfs_warn(sb, "failed to replay log file. Can't mount rw!"); err = -EINVAL; goto out; } } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!is_ro && !sbi->options.force) { + if (!sb_rdonly(sb) && !sbi->options->force) { ntfs_warn( sb, "volume is dirty and \"force\" flag is not set!"); @@ -1098,9 +1032,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFT."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1112,11 +1045,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = wnd_init(&sbi->mft.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; err = ni_load_all_mi(ni); if (err) - goto out; + goto put_inode_out; sbi->mft.ni = ni; @@ -1125,9 +1058,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BADCLUST); inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $BadClus."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1150,18 +1082,15 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Bitmap."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (inode->i_size >> 32) { err = -EINVAL; - goto out; + goto put_inode_out; } #endif @@ -1169,14 +1098,14 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { err = -EINVAL; - goto out; + goto put_inode_out; } /* Not necessary. */ sbi->used.bitmap.set_tail = true; - err = wnd_init(&sbi->used.bitmap, sbi->sb, tt); + err = wnd_init(&sbi->used.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; iput(inode); @@ -1188,23 +1117,22 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) /* Load $AttrDef. 
*/ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); - inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF); + inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $AttrDef -> %d", err); - inode = NULL; + err = PTR_ERR(inode); goto out; } if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { err = -EINVAL; - goto out; + goto put_inode_out; } bytes = inode->i_size; sbi->def_table = t = kmalloc(bytes, GFP_NOFS); if (!t) { err = -ENOMEM; - goto out; + goto put_inode_out; } for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) { @@ -1213,7 +1141,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), min(PAGE_SIZE, tail)); @@ -1221,7 +1149,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (!idx && ATTR_STD != t->type) { err = -EINVAL; - goto out; + goto put_inode_out; } } @@ -1254,33 +1182,24 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $UpCase."); err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; goto out; } - ni = ntfs_i(inode); - if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; - goto out; - } - - sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); - if (!upcase) { - err = -ENOMEM; - goto out; + goto put_inode_out; } for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) { const __le16 *src; - u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT); + u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT); struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } src = page_address(page); @@ -1294,14 +1213,13 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ntfs_unmap_page(page); } - shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short)); - if (shared && upcase != shared) { + shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); + if (shared && sbi->upcase != shared) { + kvfree(sbi->upcase); sbi->upcase = shared; - kvfree(upcase); } iput(inode); - inode = NULL; if (is_ntfs3(sbi)) { /* Load $Secure. */ @@ -1331,34 +1249,31 @@ load_root: ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load root."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) { - err = -EINVAL; - goto out; + err = -ENOMEM; + goto put_inode_out; } + fc->fs_private = NULL; + return 0; -out: +put_inode_out: iput(inode); - - if (sb->s_root) { - d_drop(sb->s_root); - sb->s_root = NULL; - } - +out: + /* + * Free resources here. 
+ * ntfs_fs_free will be called with fc->s_fs_info = NULL + */ put_ntfs(sbi); - sb->s_fs_info = NULL; + return err; } @@ -1403,7 +1318,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) if (sbi->flags & NTFS_FLAGS_NODISCARD) return -EOPNOTSUPP; - if (!sbi->options.discard) + if (!sbi->options->discard) return -EOPNOTSUPP; lbo = (u64)lcn << sbi->cluster_bits; @@ -1428,19 +1343,99 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) return err; } -static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int ntfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ntfs_fill_super); +} + +/* + * ntfs_fs_free - Free fs_context. + * + * Note that this will be called after fill_super and reconfigure + * even when they pass. So they have to take pointers if they pass. + */ +static void ntfs_fs_free(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); + struct ntfs_mount_options *opts = fc->fs_private; + struct ntfs_sb_info *sbi = fc->s_fs_info; + + if (sbi) + put_ntfs(sbi); + + if (opts) + put_mount_options(opts); +} + +static const struct fs_context_operations ntfs_context_ops = { + .parse_param = ntfs_fs_parse_param, + .get_tree = ntfs_fs_get_tree, + .reconfigure = ntfs_fs_reconfigure, + .free = ntfs_fs_free, +}; + +/* + * ntfs_init_fs_context - Initialize sbi and opts + * + * This will be called when mounting or remounting. We will first initialize + * options so that on remount we can just reuse them. + */ +static int ntfs_init_fs_context(struct fs_context *fc) +{ + struct ntfs_mount_options *opts; + struct ntfs_sb_info *sbi; + + opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS); + if (!opts) + return -ENOMEM; + + /* Default options.
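With the wiring above, every mount option string is fed to ntfs_fs_parse_param() one parameter at a time, and the superblock is built by ntfs_fs_get_tree()/ntfs_fill_super(). A rough userspace sketch of driving that path through the new mount API syscalls is shown below; the device path and mount point are placeholders, the option names are the ones from the parameter table in this patch, and SYS_fsopen/SYS_fsconfig/SYS_fsmount/SYS_move_mount assume reasonably recent uapi headers:

/* Hypothetical userspace sketch, not part of this series. */
#include <fcntl.h>          /* AT_FDCWD */
#include <linux/mount.h>    /* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* fsopen() creates the fs_context; the kernel calls ntfs_init_fs_context(). */
	int fsfd = syscall(SYS_fsopen, "ntfs3", 0);
	if (fsfd < 0) {
		perror("fsopen");
		return 1;
	}

	/* Each fsconfig() key/value ends up in ntfs_fs_parse_param(). */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "iocharset", "utf8", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "noacsrules", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/sdb1", 0);

	/* CMD_CREATE triggers ->get_tree(), i.e. ntfs_fill_super(). */
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("fsconfig");
		return 1;
	}

	int mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	if (mntfd < 0) {
		perror("fsmount");
		return 1;
	}

	if (syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/mnt",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount");
		return 1;
	}

	return 0;
}

A legacy mount(2) call with a comma-separated option string reaches the same parse_param callback: the VFS splits the monolithic string and hands the options over one at a time.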
*/ + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask_inv = ~current_umask(); + opts->fs_dmask_inv = ~current_umask(); + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + goto ok; + + sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); + if (!sbi) + goto free_opts; + + sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); + if (!sbi->upcase) + goto free_sbi; + + ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + mutex_init(&sbi->compress.mtx_lznt); +#ifdef CONFIG_NTFS3_LZX_XPRESS + mutex_init(&sbi->compress.mtx_xpress); + mutex_init(&sbi->compress.mtx_lzx); +#endif + + sbi->options = opts; + fc->s_fs_info = sbi; +ok: + fc->fs_private = opts; + fc->ops = &ntfs_context_ops; + + return 0; +free_sbi: + kfree(sbi); +free_opts: + kfree(opts); + return -ENOMEM; } // clang-format off static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs3", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .owner = THIS_MODULE, + .name = "ntfs3", + .init_fs_context = ntfs_init_fs_context, + .parameters = ntfs_fs_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c index bbeba778237e..b5e8256fd710 100644 --- a/fs/ntfs3/upcase.c +++ b/fs/ntfs3/upcase.c @@ -5,13 +5,9 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/nls.h> +#include <linux/kernel.h> +#include <linux/types.h> -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7282d85c4ece..afd0ddad826f 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -78,6 +75,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, size_t add_bytes, const struct EA_INFO **info) { int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *attr_info, *attr_ea; void *ea_p; @@ -102,10 +100,10 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, /* Check Ea limit. */ size = le32_to_cpu((*info)->size); - if (size > ni->mi.sbi->ea_max_size) + if (size > sbi->ea_max_size) return -EFBIG; - if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size) + if (attr_size(attr_ea) > sbi->ea_max_size) return -EFBIG; /* Allocate memory for packed Ea. 
*/ @@ -113,15 +111,16 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, if (!ea_p) return -ENOMEM; - if (attr_ea->non_res) { + if (!size) { + ; + } else if (attr_ea->non_res) { struct runs_tree run; run_init(&run); err = attr_load_runs(attr_ea, ni, &run, NULL); if (!err) - err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size, - NULL); + err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL); run_close(&run); if (err) @@ -260,7 +259,7 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags, int locked) + size_t val_size, int flags) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -279,8 +278,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, u64 new_sz; void *p; - if (!locked) - ni_lock(ni); + ni_lock(ni); run_init(&ea_run); @@ -370,21 +368,22 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, new_ea->name[name_len] = 0; memcpy(new_ea->name + name_len + 1, value, val_size); new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea); - - /* Should fit into 16 bits. */ - if (new_pack > 0xffff) { - err = -EFBIG; // -EINVAL? - goto out; - } ea_info.size_pack = cpu_to_le16(new_pack); - /* New size of ATTR_EA. */ size += add; - if (size > sbi->ea_max_size) { + ea_info.size = cpu_to_le32(size); + + /* + * 1. Check ea_info.size_pack for overflow. + * 2. New attibute size must fit value from $AttrDef + */ + if (new_pack > 0xffff || size > sbi->ea_max_size) { + ntfs_inode_warn( + inode, + "The size of extended attributes must not exceed 64KiB"); err = -EFBIG; // -EINVAL? goto out; } - ea_info.size = cpu_to_le32(size); update_ea: @@ -444,7 +443,7 @@ update_ea: /* Delete xattr, ATTR_EA */ ni_remove_attr_le(ni, attr, mi, le); } else if (attr->non_res) { - err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size); + err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0); if (err) goto out; } else { @@ -468,8 +467,7 @@ update_ea: mark_inode_dirty(&ni->vfs_inode); out: - if (!locked) - ni_unlock(ni); + ni_unlock(ni); run_close(&ea_run); kfree(ea_all); @@ -478,12 +476,6 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static inline void ntfs_posix_acl_release(struct posix_acl *acl) -{ - if (acl && refcount_dec_and_test(&acl->a_refcount)) - kfree(acl); -} - static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, int type, int locked) @@ -521,12 +513,15 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, /* Translate extended attribute to acl. */ if (err >= 0) { acl = posix_acl_from_xattr(mnt_userns, buf, err); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); + } else if (err == -ENODATA) { + acl = NULL; } else { - acl = err == -ENODATA ? 
NULL : ERR_PTR(err); + acl = ERR_PTR(err); } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + __putname(buf); return acl; @@ -546,12 +541,13 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, - int type, int locked) + int type) { const char *name; size_t size, name_len; void *value = NULL; int err = 0; + int flags; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -561,22 +557,15 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, if (acl) { umode_t mode = inode->i_mode; - err = posix_acl_equiv_mode(acl, &mode); - if (err < 0) - return err; + err = posix_acl_update_mode(mnt_userns, inode, &mode, + &acl); + if (err) + goto out; if (inode->i_mode != mode) { inode->i_mode = mode; mark_inode_dirty(inode); } - - if (!err) { - /* - * ACL can be exactly represented in the - * traditional file mode permission bits. - */ - acl = NULL; - } } name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; @@ -594,20 +583,24 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, } if (!acl) { + /* Remove xattr if it can be presented via mode. */ size = 0; value = NULL; + flags = XATTR_REPLACE; } else { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(mnt_userns, acl, value, size); if (err < 0) goto out; + flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); + if (err == -ENODATA && !size) + err = 0; /* Removing non existed xattr. */ if (!err) set_cached_acl(inode, type, acl); @@ -623,68 +616,7 @@ out: int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0); -} - -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); - ntfs_posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(mnt_userns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(mnt_userns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - ntfs_posix_acl_release(acl); - return err; + return ntfs_set_acl_ex(mnt_userns, inode, acl, type); } /* @@ -698,54 +630,27 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *default_acl, *acl; int err; - /* - * TODO: Refactoring lock. - * ni_lock(dir) ... -> posix_acl_create(dir,...) 
-> ntfs_get_acl -> ni_lock(dir) - */ - inode->i_default_acl = NULL; - - default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1); - - if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) { - inode->i_mode &= ~current_umask(); - err = 0; - goto out; - } - - if (IS_ERR(default_acl)) { - err = PTR_ERR(default_acl); - goto out; - } - - acl = default_acl; - err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - goto out1; - if (!err) { - posix_acl_release(acl); - acl = NULL; - } + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; - if (!S_ISDIR(inode->i_mode)) { + if (default_acl) { + err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); - default_acl = NULL; + } else { + inode->i_default_acl = NULL; } - if (default_acl) - err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, - ACL_TYPE_DEFAULT, 1); - if (!acl) inode->i_acl = NULL; - else if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS, - 1); - - posix_acl_release(acl); -out1: - posix_acl_release(default_acl); + else { + if (!err) + err = ntfs_set_acl_ex(mnt_userns, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } -out: return err; } #endif @@ -772,7 +677,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { - if (ntfs_sb(inode->i_sb)->options.no_acs_rules) { + if (ntfs_sb(inode->i_sb)->options->noacsrules) { /* "No access rules" mode - Allow all changes. */ return 0; } @@ -880,23 +785,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -1009,24 +897,8 @@ set_new_fa: goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. 
*/ - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); out: return err; @@ -1042,28 +914,29 @@ int ntfs_save_wsl_perm(struct inode *inode) int err; __le32 value; + /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */ value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f1cc8258d34a..5d9ae17bd443 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7045,7 +7045,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, i, has_data, num_pages = 0; + int ret, has_data, num_pages = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7054,26 +7054,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page **pages = NULL; - loff_t end = osb->s_clustersize; + struct page *page = NULL; struct ocfs2_extent_tree et; int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; if (has_data) { - pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto free_pages; + goto out; } } @@ -7093,7 +7084,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - unsigned int page_end; + unsigned int page_end = min_t(unsigned, PAGE_SIZE, + osb->s_clustersize); u64 phys; ret = dquot_alloc_space_nodirty(inode, @@ -7117,15 +7109,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - /* - * Non sparse file systems zero on extend, so no need - * to do that now. - */ - if (!ocfs2_sparse_alloc(osb) && - PAGE_SIZE < osb->s_clustersize) - end = PAGE_SIZE; - - ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, + &num_pages); if (ret) { mlog_errno(ret); need_free = 1; @@ -7136,20 +7121,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. 
*/ - ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + ret = ocfs2_read_inline_data(inode, page, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - page_end = PAGE_SIZE; - if (PAGE_SIZE > osb->s_clustersize) - page_end = osb->s_clustersize; - - for (i = 0; i < num_pages; i++) - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, - pages[i], i > 0, &phys); + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -7180,8 +7160,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (pages) - ocfs2_unlock_and_free_pages(pages, num_pages); + if (page) + ocfs2_unlock_and_free_pages(&page, num_pages); out_commit: if (ret < 0 && did_quota) @@ -7205,8 +7185,6 @@ out_commit: out: if (data_ac) ocfs2_free_alloc_context(data_ac); -free_pages: - kfree(pages); return ret; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8521942f5af2..481017e1dac5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; - int ret; + int ret = 1; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jh = bh2jh(bg_bh); - spin_lock(&jh->b_state_lock); - bg = (struct ocfs2_group_desc *) jh->b_committed_data; - if (bg) - ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); - else - ret = 1; - spin_unlock(&jh->b_state_lock); + jbd_lock_bh_journal_head(bg_bh); + if (buffer_jbd(bg_bh)) { + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + spin_unlock(&jh->b_state_lock); + } + jbd_unlock_bh_journal_head(bg_bh); return ret; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c86bd4e60e20..5c914ce9b3ac 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2167,11 +2167,17 @@ static int ocfs2_initialize_super(struct super_block *sb, } if (ocfs2_clusterinfo_valid(osb)) { + /* + * ci_stack and ci_cluster in ocfs2_cluster_info may not be null + * terminated, so make sure no overflow happens here by using + * memcpy. Destination strings will always be null terminated + * because osb is allocated using kzalloc. + */ osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - strlcpy(osb->osb_cluster_stack, + memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN + 1); + OCFS2_STACK_LABEL_LEN); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2180,9 +2186,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } - strlcpy(osb->osb_cluster_name, + memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, - OCFS2_CLUSTER_NAME_LEN + 1); + OCFS2_CLUSTER_NAME_LEN); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index c1bb4c4b5d67..e5e3e500ed46 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -10,7 +10,7 @@ * Linux VFS inode operations. 
*/ -#include <linux/bvec.h> +#include <linux/blkdev.h> #include <linux/fileattr.h> #include "protocol.h" #include "orangefs-kernel.h" diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 2f2e430461b2..8bb0a53a658b 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/hashtable.h> +#include <linux/seq_file.h> /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 1fefb2b8960e..93c7c267de93 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1219,9 +1219,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, goto out_dput; } } else { - if (!d_is_negative(newdentry) && - (!new_opaque || !ovl_is_whiteout(newdentry))) - goto out_dput; + if (!d_is_negative(newdentry)) { + if (!new_opaque || !ovl_is_whiteout(newdentry)) + goto out_dput; + } else { + if (flags & RENAME_EXCHANGE) + goto out_dput; + } } if (olddentry == trap) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index d081faa55e83..ac461a499882 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -272,14 +272,14 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) kmem_cache_free(ovl_aio_request_cachep, aio_req); } -static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +static void ovl_aio_rw_complete(struct kiocb *iocb, long res) { struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); struct kiocb *orig_iocb = aio_req->orig_iocb; ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res, res2); + orig_iocb->ki_complete(orig_iocb, res); } static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -296,6 +296,12 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, @@ -320,7 +326,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) out: revert_creds(old_cred); ovl_file_accessed(file); - +out_fdput: fdput(real); return ret; @@ -349,6 +355,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -384,6 +396,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) } out: revert_creds(old_cred); +out_fdput: fdput(real); out_unlock: diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 04ce58c939a0..5d1fbaffd66a 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, static int __register_pstore_blk(struct pstore_device_info *dev, const char *devpath) { - struct inode *inode; int ret = -ENODEV; lockdep_assert_held(&pstore_blk_lock); @@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev, goto err; } - inode = file_inode(psblk_file); - if (!S_ISBLK(inode->i_mode)) { + if (!S_ISBLK(file_inode(psblk_file)->i_mode)) { pr_err("'%s' is not block 
device!\n", devpath); goto err_fput; } - inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode; - dev->zone.total_size = i_size_read(inode); + dev->zone.total_size = + bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host)); ret = __register_pstore_device(dev); if (ret) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2bcc9a6f1bfc..052f143e2e0e 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,6 +10,7 @@ #include <linux/namei.h> #include <linux/slab.h> #include <asm/current.h> +#include <linux/blkdev.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/security.h> diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 65e7e56005b8..e2302342a67f 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -38,6 +38,7 @@ #include <linux/uaccess.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/seq_file.h> #include "internal.h" struct ramfs_mount_opts { diff --git a/fs/read_write.c b/fs/read_write.c index af057c57bdc6..0074afa7ecb3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (unlikely((ssize_t) count < 0)) return -EINVAL; - /* - * ranged mandatory locking does not apply to streams - it makes sense - * only for files where position has a meaning. - */ if (ppos) { loff_t pos = *ppos; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 58481f8d63d5..076f9ab94306 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s, if (!strcmp(arg, "auto")) { /* From JFS code, to auto-get the size. */ - *blocks = - i_size_read(s->s_bdev->bd_inode) >> s-> - s_blocksize_bits; + *blocks = sb_bdev_nr_blocks(s); } else { *blocks = simple_strtoul(arg, &p, 0); if (*p != '\0') { @@ -1986,9 +1984,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) * smaller than the filesystem. If the check fails then abort and * scream, because bad stuff will happen otherwise. */ - if (s->s_bdev && s->s_bdev->bd_inode - && i_size_read(s->s_bdev->bd_inode) < - sb_block_count(rs) * sb_blocksize(rs)) { + if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) { SWARN(silent, s, "", "Filesystem cannot be " "mounted because it is bigger than the device"); SWARN(silent, s, "", "You may need to run fsck " diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60d6951915f4..bb44ff4c5cc6 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> @@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); - if (msblk->bytes_used < 0 || msblk->bytes_used > - i_size_read(sb->s_bdev->bd_inode)) + if (msblk->bytes_used < 0 || + msblk->bytes_used > bdev_nr_bytes(sb->s_bdev)) goto failed_mount; /* Check block size for sanity */ diff --git a/fs/sync.c b/fs/sync.c index 1373a610dc78..3ce8e2137f31 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -3,6 +3,7 @@ * High-level sync()-related operations */ +#include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/file.h> #include <linux/fs.h> @@ -22,25 +23,6 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. 
For simple filesystems - * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to - * submit IO for these buffers via __sync_blockdev(). This also speeds up the - * wait == 1 case since in that case write_inode() functions do - * sync_dirty_buffer() and thus effectively write one block at a time. - */ -static int __sync_filesystem(struct super_block *sb, int wait) -{ - if (wait) - sync_inodes_sb(sb); - else - writeback_inodes_sb(sb, WB_REASON_SYNC); - - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* * Write out and wait upon all dirty data associated with this * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. @@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb) if (sb_rdonly(sb)) return 0; - ret = __sync_filesystem(sb, 0); + /* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have + * to submit I/O for these buffers via sync_blockdev(). This also + * speeds up the wait == 1 case since in that case write_inode() + * methods call sync_dirty_buffer() and thus effectively write one block + * at a time. + */ + writeback_inodes_sb(sb, WB_REASON_SYNC); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 0); + ret = sync_blockdev_nowait(sb->s_bdev); if (ret < 0) return ret; - return __sync_filesystem(sb, 1); + + sync_inodes_sb(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); @@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg) sb->s_op->sync_fs(sb, *(int *)arg); } -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) -{ - filemap_fdatawrite(bdev->bd_inode->i_mapping); -} - -static void fdatawait_one_bdev(struct block_device *bdev, void *arg) -{ - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. - */ - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); -} - /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. 
Then we sync all inodes reliably @@ -114,8 +96,8 @@ void ksys_sync(void) iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); - iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_bdevs(fdatawait_one_bdev, NULL); + sync_bdevs(false); + sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } @@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work) */ iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..c57b46a352d8 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index f1094cdcd6cd..46d697172197 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb) unsigned long udf_get_last_block(struct super_block *sb) { - struct block_device *bdev = sb->s_bdev; - struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk); + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; /* @@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb) * Try using the device size... */ if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) - lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits; + lblock = sb_bdev_nr_blocks(sb); if (lblock) return lblock - 1; diff --git a/fs/udf/super.c b/fs/udf/super.c index b2d7c57d0688..34247fba6df9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; - sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + sector_t blocks = sb_bdev_nr_blocks(sb); udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && @@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, int ret; if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= - i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits) + udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb)) return -EAGAIN; bh = udf_read_tagged(sb, block, block, &ident); @@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { - if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits) + if (last[i] >= sb_bdev_nr_blocks(sb)) continue; ret = udf_check_anchor_block(sb, last[i], fileset); if (ret != -EAGAIN) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 003f0d31743e..22bf14ab2d16 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, if (mode_wp && mode_dontwake) return -EINVAL; - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - 
uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + if (mmget_not_zero(ctx->mm)) { + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (ret) return ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7aa943edfc02..62e7fbe4e54c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ddc346a9df9b..3ce5f47338cb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = { .write_iter = zonefs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, }; static struct kmem_cache *zonefs_inode_cachep; diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index fb172a03a753..20ecb004f5a4 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -22,9 +22,14 @@ typedef __builtin_va_list va_list; #define va_arg(v, l) __builtin_va_arg(v, l) #define va_copy(d, s) __builtin_va_copy(d, s) #else +#ifdef __KERNEL__ #include <linux/stdarg.h> -#endif -#endif +#else +/* Used to build acpi tools */ +#include <stdarg.h> +#endif /* __KERNEL__ */ +#endif /* ACPI_USE_BUILTIN_STDARG */ +#endif /* ! va_arg */ #define ACPI_INLINE __inline__ diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 4a674db4e1fa..fedc0dfa4877 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -49,9 +49,15 @@ static inline void flush_cache_page(struct vm_area_struct *vma, static inline void flush_dcache_page(struct page *page) { } + +static inline void flush_dcache_folio(struct folio *folio) { } #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO #endif +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio); +#endif #ifndef flush_dcache_mmap_lock static inline void flush_dcache_mmap_lock(struct address_space *mapping) diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index cc7338f9e0d1..7ce93aaf69f8 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -957,7 +957,7 @@ static inline void __iomem *ioremap(phys_addr_t offset, size_t size) #ifndef iounmap #define iounmap iounmap -static inline void iounmap(void __iomem *addr) +static inline void iounmap(volatile void __iomem *addr) { } #endif diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 73c7139c866f..e715bdb720d5 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -24,7 +24,7 @@ enum arch_timer_reg { ARCH_TIMER_REG_CTRL, - ARCH_TIMER_REG_TVAL, + ARCH_TIMER_REG_CVAL, }; enum arch_timer_ppi_nr { diff --git a/include/kunit/test.h b/include/kunit/test.h index 24b40e5c160b..018e776a34b9 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -613,7 +613,7 @@ void kunit_remove_resource(struct kunit *test, struct kunit_resource *res); * and is automatically cleaned up after the test case 
concludes. See &struct * kunit_resource for more information. */ -void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t flags); +void *kunit_kmalloc_array(struct kunit *test, size_t n, size_t size, gfp_t gfp); /** * kunit_kmalloc() - Like kmalloc() except the allocation is *test managed*. @@ -657,9 +657,9 @@ static inline void *kunit_kzalloc(struct kunit *test, size_t size, gfp_t gfp) * * See kcalloc() and kunit_kmalloc_array() for more information. */ -static inline void *kunit_kcalloc(struct kunit *test, size_t n, size_t size, gfp_t flags) +static inline void *kunit_kcalloc(struct kunit *test, size_t n, size_t size, gfp_t gfp) { - return kunit_kmalloc_array(test, n, size, flags | __GFP_ZERO); + return kunit_kmalloc_array(test, n, size, gfp | __GFP_ZERO); } void kunit_cleanup(struct kunit *test); diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 7d1cabe15262..63ccb5252190 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -321,10 +321,20 @@ asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0); * from register 0 to 3 on return from the SMC instruction. An optional * quirk structure provides vendor specific behavior. */ +#ifdef CONFIG_HAVE_ARM_SMCCC asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4, unsigned long a5, unsigned long a6, unsigned long a7, struct arm_smccc_res *res, struct arm_smccc_quirk *quirk); +#else +static inline void __arm_smccc_smc(unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, unsigned long a4, + unsigned long a5, unsigned long a6, unsigned long a7, + struct arm_smccc_res *res, struct arm_smccc_quirk *quirk) +{ + *res = (struct arm_smccc_res){}; +} +#endif /** * __arm_smccc_hvc() - make HVC calls diff --git a/include/linux/ata.h b/include/linux/ata.h index 1b44f40c7700..199e47e97d64 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -329,6 +329,7 @@ enum { ATA_LOG_SECURITY = 0x06, ATA_LOG_SATA_SETTINGS = 0x08, ATA_LOG_ZONED_INFORMATION = 0x09, + ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device SATA settings log:*/ ATA_LOG_DEVSLP_OFFSET = 0x30, diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ac7f231b8825..9c14f0a8dbe5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -12,13 +12,13 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> -#include <linux/blkdev.h> #include <linux/device.h> #include <linux/writeback.h> -#include <linux/blk-cgroup.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> +struct blkcg; + static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); @@ -64,7 +64,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) return atomic_long_read(&bdi->tot_write_bandwidth); } -static inline void __add_wb_stat(struct bdi_writeback *wb, +static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); @@ -72,12 +72,12 @@ static inline void __add_wb_stat(struct bdi_writeback *wb, static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - __add_wb_stat(wb, item, 1); + wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - __add_wb_stat(wb, item, -1); + wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum 
wb_stat_item item) @@ -133,20 +133,7 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb) return test_bit(WB_writeback_running, &wb->state); } -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return I_BDEV(inode)->bd_disk->bdi; -#endif - return sb->s_bdi; -} +struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 00952e92eae1..fe6bdfbbef66 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -6,19 +6,10 @@ #define __LINUX_BIO_H #include <linux/mempool.h> -#include <linux/ioprio.h> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> #include <linux/uio.h> -#define BIO_DEBUG - -#ifdef BIO_DEBUG -#define BIO_BUG_ON BUG_ON -#else -#define BIO_BUG_ON -#endif - #define BIO_MAX_VECS 256U static inline unsigned int bio_max_segs(unsigned int nr_segs) @@ -78,22 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio) bio_op(bio) == REQ_OP_WRITE_ZEROES; } -static inline bool bio_mergeable(struct bio *bio) -{ - if (bio->bi_opf & REQ_NOMERGE_FLAGS) - return false; - - return true; -} - -static inline unsigned int bio_cur_bytes(struct bio *bio) -{ - if (bio_has_data(bio)) - return bio_iovec(bio).bv_len; - else /* dataless requests such as discard */ - return bio->bi_iter.bi_size; -} - static inline void *bio_data(struct bio *bio) { if (bio_has_data(bio)) @@ -102,25 +77,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -/** - * bio_full - check if the bio is full - * @bio: bio to check - * @len: length of one segment to be added - * - * Return true if @bio is full and one segment with @len bytes can't be - * added to the bio, otherwise return false - */ -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - - return false; -} - static inline bool bio_next_segment(const struct bio *bio, struct bvec_iter_all *iter) { @@ -163,6 +119,28 @@ static inline void bio_advance_iter_single(const struct bio *bio, bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); } +void __bio_advance(struct bio *, unsigned bytes); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. 
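
For context, a hedged sketch of the usual caller pattern for the now-inlined fast path: a driver (names invented) that has transferred part of a bio advances the iterator and completes the bio once nothing is left.

#include <linux/bio.h>

/* Illustrative only: account 'done' bytes of progress on a bio. */
static void example_bio_progress(struct bio *bio, unsigned int done)
{
	bio_advance(bio, done);		/* bi_iter now covers the remainder */
	if (!bio->bi_iter.bi_size)
		bio_endio(bio);		/* fully transferred, complete it */
}
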
+ */ +static inline void bio_advance(struct bio *bio, unsigned int nbytes) +{ + if (nbytes == bio->bi_iter.bi_size) { + bio->bi_iter.bi_size = 0; + return; + } + __bio_advance(bio, nbytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ @@ -265,37 +243,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit) bio->bi_flags &= ~(1U << bit); } -static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) -{ - *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); -} - -static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) -{ - struct bvec_iter iter = bio->bi_iter; - int idx; - - bio_get_first_bvec(bio, bv); - if (bv->bv_len == bio->bi_iter.bi_size) - return; /* this bio only has a single bvec */ - - bio_advance_iter(bio, &iter, iter.bi_size); - - if (!iter.bi_bvec_done) - idx = iter.bi_idx - 1; - else /* in the middle of bvec */ - idx = iter.bi_idx; - - *bv = bio->bi_io_vec[idx]; - - /* - * iter.bi_bvec_done records actual length of the last bvec - * if this bio ends in the middle of one io vector - */ - if (iter.bi_bvec_done) - bv->bv_len = iter.bi_bvec_done; -} - static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); @@ -424,7 +371,7 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -extern blk_qc_t submit_bio(struct bio *); +void submit_bio(struct bio *bio); extern void bio_endio(struct bio *); @@ -456,8 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) struct request_queue; extern int submit_bio_wait(struct bio *bio); -extern void bio_advance(struct bio *, unsigned); - extern void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs); extern void bio_uninit(struct bio *); @@ -469,12 +414,11 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -void bio_release_pages(struct bio *bio, bool mark_dirty); +void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); +void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); @@ -482,27 +426,16 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); -void bio_truncate(struct bio *bio, unsigned new_size); void guard_bio_eod(struct bio *bio); void zero_fill_bio(struct bio *bio); -extern const char *bio_devname(struct bio *bio, char *buffer); +static inline void bio_release_pages(struct bio *bio, bool mark_dirty) +{ + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + __bio_release_pages(bio, mark_dirty); +} -#define bio_set_dev(bio, bdev) \ -do { \ - bio_clear_flag(bio, BIO_REMAPPED); \ - if ((bio)->bi_bdev != (bdev)) \ - bio_clear_flag(bio, BIO_THROTTLED); \ - (bio)->bi_bdev = (bdev); \ - bio_associate_blkg(bio); \ -} while (0) - 
-#define bio_copy_dev(dst, src) \ -do { \ - bio_clear_flag(dst, BIO_REMAPPED); \ - (dst)->bi_bdev = (src)->bi_bdev; \ - bio_clone_blkg_association(dst, src); \ -} while (0) +extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_dev(bio) \ disk_devt((bio)->bi_bdev->bd_disk) @@ -521,6 +454,22 @@ static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ +static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) +{ + bio_clear_flag(bio, BIO_REMAPPED); + if (bio->bi_bdev != bdev) + bio_clear_flag(bio, BIO_THROTTLED); + bio->bi_bdev = bdev; + bio_associate_blkg(bio); +} + +static inline void bio_copy_dev(struct bio *dst, struct bio *src) +{ + bio_clear_flag(dst, BIO_REMAPPED); + dst->bi_bdev = src->bi_bdev; + bio_clone_blkg_association(dst, src); +} + /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * @@ -784,7 +733,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, */ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) { - bio->bi_opf |= REQ_HIPRI; + bio->bi_opf |= REQ_POLLED; if (!is_sync_kiocb(kiocb)) bio->bi_opf |= REQ_NOWAIT; } diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h new file mode 100644 index 000000000000..bbab65bd5428 --- /dev/null +++ b/include/linux/blk-crypto-profile.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_PROFILE_H +#define __LINUX_BLK_CRYPTO_PROFILE_H + +#include <linux/bio.h> +#include <linux/blk-crypto.h> + +struct blk_crypto_profile; + +/** + * struct blk_crypto_ll_ops - functions to control inline encryption hardware + * + * Low-level operations for controlling inline encryption hardware. This + * interface must be implemented by storage drivers that support inline + * encryption. All functions may sleep, are serialized by profile->lock, and + * are never called while profile->dev (if set) is runtime-suspended. + */ +struct blk_crypto_ll_ops { + + /** + * @keyslot_program: Program a key into the inline encryption hardware. + * + * Program @key into the specified @slot in the inline encryption + * hardware, overwriting any key that the keyslot may already contain. + * The keyslot is guaranteed to not be in-use by any I/O. + * + * This is required if the device has keyslots. Otherwise (i.e. if the + * device is a layered device, or if the device is real hardware that + * simply doesn't have the concept of keyslots) it is never called. + * + * Must return 0 on success, or -errno on failure. + */ + int (*keyslot_program)(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot); + + /** + * @keyslot_evict: Evict a key from the inline encryption hardware. + * + * If the device has keyslots, this function must evict the key from the + * specified @slot. The slot will contain @key, but there should be no + * need for the @key argument to be used as @slot should be sufficient. + * The keyslot is guaranteed to not be in-use by any I/O. + * + * If the device doesn't have keyslots itself, this function must evict + * @key from any underlying devices. @slot won't be valid in this case. + * + * If there are no keyslots and no underlying devices, this function + * isn't required. + * + * Must return 0 on success, or -errno on failure. 
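
As a rough illustration of how a driver with hardware keyslots might fill in these hooks: everything named mydev_* below is an invented stand-in for real hardware accessors, not part of the patch.

#include <linux/blk-crypto-profile.h>

/* Hypothetical driver state embedding the profile. */
struct mydev {
	struct device *dev;
	struct request_queue *queue;
	struct blk_crypto_profile crypto_profile;
};

/* Hypothetical hardware accessors, provided elsewhere in the driver. */
int mydev_hw_program_key(struct mydev *dev, unsigned int slot,
			 const struct blk_crypto_key *key);
int mydev_hw_clear_slot(struct mydev *dev, unsigned int slot);

static int mydev_keyslot_program(struct blk_crypto_profile *profile,
				 const struct blk_crypto_key *key,
				 unsigned int slot)
{
	struct mydev *dev = container_of(profile, struct mydev, crypto_profile);

	return mydev_hw_program_key(dev, slot, key);	/* 0 or -errno */
}

static int mydev_keyslot_evict(struct blk_crypto_profile *profile,
			       const struct blk_crypto_key *key,
			       unsigned int slot)
{
	struct mydev *dev = container_of(profile, struct mydev, crypto_profile);

	return mydev_hw_clear_slot(dev, slot);		/* 0 or -errno */
}

static const struct blk_crypto_ll_ops mydev_crypto_ops = {
	.keyslot_program	= mydev_keyslot_program,
	.keyslot_evict		= mydev_keyslot_evict,
};
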
+ */ + int (*keyslot_evict)(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot); +}; + +/** + * struct blk_crypto_profile - inline encryption profile for a device + * + * This struct contains a storage device's inline encryption capabilities (e.g. + * the supported crypto algorithms), driver-provided functions to control the + * inline encryption hardware (e.g. programming and evicting keys), and optional + * device-independent keyslot management data. + */ +struct blk_crypto_profile { + + /* public: Drivers must initialize the following fields. */ + + /** + * @ll_ops: Driver-provided functions to control the inline encryption + * hardware, e.g. program and evict keys. + */ + struct blk_crypto_ll_ops ll_ops; + + /** + * @max_dun_bytes_supported: The maximum number of bytes supported for + * specifying the data unit number (DUN). Specifically, the range of + * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1. + */ + unsigned int max_dun_bytes_supported; + + /** + * @modes_supported: Array of bitmasks that specifies whether each + * combination of crypto mode and data unit size is supported. + * Specifically, the i'th bit of modes_supported[crypto_mode] is set if + * crypto_mode can be used with a data unit size of (1 << i). Note that + * only data unit sizes that are powers of 2 can be supported. + */ + unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX]; + + /** + * @dev: An optional device for runtime power management. If the driver + * provides this device, it will be runtime-resumed before any function + * in @ll_ops is called and will remain resumed during the call. + */ + struct device *dev; + + /* private: The following fields shouldn't be accessed by drivers. */ + + /* Number of keyslots, or 0 if not applicable */ + unsigned int num_slots; + + /* + * Serializes all calls to functions in @ll_ops as well as all changes + * to @slot_hashtable. This can also be taken in read mode to look up + * keyslots while ensuring that they can't be changed concurrently. + */ + struct rw_semaphore lock; + + /* List of idle slots, with least recently used slot at front */ + wait_queue_head_t idle_slots_wait_queue; + struct list_head idle_slots; + spinlock_t idle_slots_lock; + + /* + * Hash table which maps struct *blk_crypto_key to keyslots, so that we + * can find a key's keyslot in O(1) time rather than O(num_slots). + * Protected by 'lock'. 
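
Continuing the same hypothetical mydev driver from the previous sketch, the public fields above would typically be filled in at probe time; the keyslot count and capability values below are examples only.

/* Sketch: initialize and describe the profile for the imaginary mydev. */
static int mydev_init_crypto(struct mydev *dev)
{
	struct blk_crypto_profile *profile = &dev->crypto_profile;
	int err;

	err = devm_blk_crypto_profile_init(dev->dev, profile,
					   16 /* example keyslot count */);
	if (err)
		return err;

	profile->ll_ops = mydev_crypto_ops;
	profile->max_dun_bytes_supported = 8;
	/* bit i == support for a (1 << i)-byte data unit; 4096 bytes here */
	profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 1U << 12;
	profile->dev = dev->dev;

	/* the profile is then attached to the device's request_queue */
	return 0;
}
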
+ */ + struct hlist_head *slot_hashtable; + unsigned int log_slot_ht_size; + + /* Per-keyslot data */ + struct blk_crypto_keyslot *slots; +}; + +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots); + +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots); + +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); + +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr); + +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); + +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); + +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); + +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); + +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); + +void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, + const struct blk_crypto_profile *child); + +bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, + const struct blk_crypto_profile *reference); + +void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, + const struct blk_crypto_profile *src); + +#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */ diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h new file mode 100644 index 000000000000..8a038ea0717e --- /dev/null +++ b/include/linux/blk-integrity.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BLK_INTEGRITY_H +#define _LINUX_BLK_INTEGRITY_H + +#include <linux/blk-mq.h> + +struct request; + +enum blk_integrity_flags { + BLK_INTEGRITY_VERIFY = 1 << 0, + BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, + BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, +}; + +struct blk_integrity_iter { + void *prot_buf; + void *data_buf; + sector_t seed; + unsigned int data_size; + unsigned short interval; + const char *disk_name; +}; + +typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); +typedef void (integrity_prepare_fn) (struct request *); +typedef void (integrity_complete_fn) (struct request *, unsigned int); + +struct blk_integrity_profile { + integrity_processing_fn *generate_fn; + integrity_processing_fn *verify_fn; + integrity_prepare_fn *prepare_fn; + integrity_complete_fn *complete_fn; + const char *name; +}; + +#ifdef CONFIG_BLK_DEV_INTEGRITY +void blk_integrity_register(struct gendisk *, struct blk_integrity *); +void blk_integrity_unregister(struct gendisk *); +int blk_integrity_compare(struct gendisk *, struct gendisk *); +int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, + struct scatterlist *); +int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); + +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->queue->integrity; + + if (!bi->profile) + return NULL; + + return bi; +} + +static inline struct blk_integrity * +bdev_get_integrity(struct block_device *bdev) +{ + return blk_get_integrity(bdev->bd_disk); +} + +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return q->integrity.profile; +} + +static inline void blk_queue_max_integrity_segments(struct request_queue *q, + unsigned int segs) +{ + q->limits.max_integrity_segments = segs; +} + +static inline unsigned short 
+queue_max_integrity_segments(const struct request_queue *q) +{ + return q->limits.max_integrity_segments; +} + +/** + * bio_integrity_intervals - Return number of integrity intervals for a bio + * @bi: blk_integrity profile for device + * @sectors: Size of the bio in 512-byte sectors + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the data integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. + */ +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return sectors >> (bi->interval_exp - 9); +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; +} + +static inline bool blk_integrity_rq(struct request *rq) +{ + return rq->cmd_flags & REQ_INTEGRITY; +} + +/* + * Return the first bvec that contains integrity data. Only drivers that are + * limited to a single integrity segment should use this helper. + */ +static inline struct bio_vec *rq_integrity_vec(struct request *rq) +{ + if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) + return NULL; + return rq->bio->bi_integrity->bip_vec; +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline int blk_rq_count_integrity_sg(struct request_queue *q, + struct bio *b) +{ + return 0; +} +static inline int blk_rq_map_integrity_sg(struct request_queue *q, + struct bio *b, + struct scatterlist *s) +{ + return 0; +} +static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) +{ + return NULL; +} +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + return NULL; +} +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return false; +} +static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) +{ + return 0; +} +static inline void blk_integrity_register(struct gendisk *d, + struct blk_integrity *b) +{ +} +static inline void blk_integrity_unregister(struct gendisk *d) +{ +} +static inline void blk_queue_max_integrity_segments(struct request_queue *q, + unsigned int segs) +{ +} +static inline unsigned short +queue_max_integrity_segments(const struct request_queue *q) +{ + return 0; +} + +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} +static inline int blk_integrity_rq(struct request *rq) +{ + return 0; +} + +static inline struct bio_vec *rq_integrity_vec(struct request *rq) +{ + return NULL; +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#endif /* _LINUX_BLK_INTEGRITY_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 13ba1861e688..8682663e7368 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -6,10 +6,226 @@ #include <linux/sbitmap.h> #include <linux/srcu.h> #include <linux/lockdep.h> +#include <linux/scatterlist.h> +#include <linux/prefetch.h> struct blk_mq_tags; struct blk_flush_queue; +#define BLKDEV_MIN_RQ 4 +#define BLKDEV_DEFAULT_RQ 128 + +typedef void (rq_end_io_fn)(struct request *, blk_status_t); + +/* + * request flags */ +typedef __u32 __bitwise req_flags_t; + +/* drive already may have started this one */ +#define RQF_STARTED ((__force req_flags_t)(1 << 1)) +/* may not be passed by ioscheduler 
*/ +#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) +/* request for flush sequence */ +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) +/* merge of different types, fail separately */ +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) +/* track inflight for MQ */ +#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) +/* don't call prep for this one */ +#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) +/* vaguely specified driver internal error. Ignored by the block layer */ +#define RQF_FAILED ((__force req_flags_t)(1 << 10)) +/* don't warn about errors */ +#define RQF_QUIET ((__force req_flags_t)(1 << 11)) +/* elevator private data attached */ +#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) +/* account into disk and partition IO statistics */ +#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) +/* runtime pm request */ +#define RQF_PM ((__force req_flags_t)(1 << 15)) +/* on IO scheduler merge hash */ +#define RQF_HASHED ((__force req_flags_t)(1 << 16)) +/* track IO completion time */ +#define RQF_STATS ((__force req_flags_t)(1 << 17)) +/* Look at ->special_vec for the actual data payload instead of the + bio chain. */ +#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) +/* ->timeout has been called, don't expire again */ +#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) +/* queue has elevator attached */ +#define RQF_ELV ((__force req_flags_t)(1 << 22)) + +/* flags that prevent us from merging requests: */ +#define RQF_NOMERGE_FLAGS \ + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) + +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + MQ_RQ_COMPLETE = 2, +}; + +/* + * Try to put the fields that are referenced together in the same cacheline. + * + * If you modify this structure, make sure to update blk_rq_init() and + * especially blk_mq_rq_ctx_init() to take care of the added fields. + */ +struct request { + struct request_queue *q; + struct blk_mq_ctx *mq_ctx; + struct blk_mq_hw_ctx *mq_hctx; + + unsigned int cmd_flags; /* op and common flags */ + req_flags_t rq_flags; + + int tag; + int internal_tag; + + unsigned int timeout; + + /* the following two fields are internal, NEVER access directly */ + unsigned int __data_len; /* total data len */ + sector_t __sector; /* sector cursor */ + + struct bio *bio; + struct bio *biotail; + + union { + struct list_head queuelist; + struct request *rq_next; + }; + + struct gendisk *rq_disk; + struct block_device *part; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + /* Time that the first bio started allocating this request. */ + u64 alloc_time_ns; +#endif + /* Time that this request was allocated for this IO. */ + u64 start_time_ns; + /* Time that I/O was submitted to the device. */ + u64 io_start_time_ns; + +#ifdef CONFIG_BLK_WBT + unsigned short wbt_flags; +#endif + /* + * rq sectors used for blk stats. It has the same value + * with blk_rq_sectors(rq), except that it never be zeroed + * by completion. + */ + unsigned short stats_sectors; + + /* + * Number of scatter-gather DMA addr+len pairs after + * physical address coalescing is performed. 
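
One way these flags are commonly consumed, shown only as a hedged sketch with invented mydrv_* helpers: RQF_DONTPREP lets a driver avoid preparing the same request twice when it is requeued and dispatched again.

#include <linux/blk-mq.h>

/* Hypothetical helpers, provided elsewhere in the driver. */
int mydrv_prep_command(struct request *rq);
blk_status_t mydrv_issue_command(struct blk_mq_hw_ctx *hctx, struct request *rq);

/* Sketch of the RQF_DONTPREP convention in a driver's ->queue_rq. */
static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	if (!(rq->rq_flags & RQF_DONTPREP)) {
		if (mydrv_prep_command(rq))	/* allocate per-rq resources */
			return BLK_STS_RESOURCE;
		rq->rq_flags |= RQF_DONTPREP;	/* skip prep on any requeue */
	}

	blk_mq_start_request(rq);
	return mydrv_issue_command(hctx, rq);	/* BLK_STS_OK or an error */
}
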
+ */ + unsigned short nr_phys_segments; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + unsigned short nr_integrity_segments; +#endif + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *crypt_ctx; + struct blk_crypto_keyslot *crypt_keyslot; +#endif + + unsigned short write_hint; + unsigned short ioprio; + + enum mq_rq_state state; + refcount_t ref; + + unsigned long deadline; + + /* + * The hash is used inside the scheduler, and killed once the + * request reaches the dispatch list. The ipi_list is only used + * to queue the request for softirq completion, which is long + * after the request has been unhashed (and even removed from + * the dispatch list). + */ + union { + struct hlist_node hash; /* merge hash */ + struct llist_node ipi_list; + }; + + /* + * The rb_node is only used inside the io scheduler, requests + * are pruned when moved to the dispatch queue. So let the + * completion_data share space with the rb_node. + */ + union { + struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; + void *completion_data; + int error_count; /* for legacy drivers, don't use */ + }; + + + /* + * Three pointers are available for the IO schedulers, if they need + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the elevator data. + */ + union { + struct { + struct io_cq *icq; + void *priv[2]; + } elv; + + struct { + unsigned int seq; + struct list_head list; + rq_end_io_fn *saved_end_io; + } flush; + }; + + union { + struct __call_single_data csd; + u64 fifo_time; + }; + + /* + * completion callback. + */ + rq_end_io_fn *end_io; + void *end_io_data; +}; + +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) + +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return blk_op_is_passthrough(req_op(rq)); +} + +static inline unsigned short req_get_ioprio(struct request *req) +{ + return req->ioprio; +} + +#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) + +#define rq_dma_dir(rq) \ + (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) + +enum blk_eh_timer_return { + BLK_EH_DONE, /* drivers has completed the command */ + BLK_EH_RESET_TIMER, /* reset timer and try again */ +}; + +#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ +#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ + /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device @@ -126,9 +342,6 @@ struct blk_mq_hw_ctx { unsigned long queued; /** @run: Number of dispatched requests. */ unsigned long run; -#define BLK_MQ_MAX_DISPATCH_ORDER 7 - /** @dispatched: Number of dispatch requests by queue. */ - unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; @@ -148,13 +361,6 @@ struct blk_mq_hw_ctx { /** @kobj: Kernel object for sysfs. */ struct kobject kobj; - /** @poll_considered: Count times blk_poll() was called. */ - unsigned long poll_considered; - /** @poll_invoked: Count how many requests blk_poll() polled. */ - unsigned long poll_invoked; - /** @poll_success: Count how many polled requests were completed. */ - unsigned long poll_success; - #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named @@ -232,13 +438,11 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. 
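
A hedged sketch of how the accessors above (req_op(), rq_data_dir(), and friends) are typically used when translating a request into a device command; struct mydrv_cmd and its fields are invented for illustration.

#include <linux/blk-mq.h>

/* Hypothetical device command descriptor. */
struct mydrv_cmd {
	bool		write;
	sector_t	lba;
	unsigned int	nr_sectors;
};

static blk_status_t mydrv_setup_cmd(struct request *rq, struct mydrv_cmd *cmd)
{
	cmd->write = rq_data_dir(rq) == WRITE;

	switch (req_op(rq)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		cmd->lba = blk_rq_pos(rq);		/* start sector */
		cmd->nr_sectors = blk_rq_sectors(rq);	/* request length */
		return BLK_STS_OK;
	case REQ_OP_FLUSH:
		cmd->nr_sectors = 0;			/* no data transfer */
		return BLK_STS_OK;
	default:
		return BLK_STS_NOTSUPP;
	}
}
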
- * @active_queues_shared_sbitmap: - * number of active request queues per tag set. - * @__bitmap_tags: A shared tags sbitmap, used over all hctx's - * @__breserved_tags: - * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. + * @shared_tags: + * Shared set of tags. Has @nr_hw_queues elements. If set, + * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. @@ -255,12 +459,11 @@ struct blk_mq_tag_set { unsigned int timeout; unsigned int flags; void *driver_data; - atomic_t active_queues_shared_sbitmap; - struct sbitmap_queue __bitmap_tags; - struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags; + struct blk_mq_tags *shared_tags; + struct mutex tag_list_lock; struct list_head tag_list; }; @@ -330,7 +533,7 @@ struct blk_mq_ops { /** * @poll: Called to poll for completion of a specific tag. */ - int (*poll)(struct blk_mq_hw_ctx *); + int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. @@ -364,11 +567,6 @@ struct blk_mq_ops { unsigned int); /** - * @initialize_rq_fn: Called from inside blk_get_request(). - */ - void (*initialize_rq_fn)(struct request *rq); - - /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ @@ -432,6 +630,8 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define BLK_MQ_NO_HCTX_IDX (-1U) + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ @@ -451,8 +651,6 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); -void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); - void blk_mq_free_request(struct request *rq); bool blk_mq_queue_inflight(struct request_queue *q); @@ -471,7 +669,40 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); + +/* + * Tag address space map. + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; + + struct request **rqs; + struct request **static_rqs; + struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; +}; + +static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, + unsigned int tag) +{ + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); + return tags->rqs[tag]; + } + + return NULL; +} enum { BLK_MQ_UNIQUE_TAG_BITS = 16, @@ -524,6 +755,35 @@ static inline void blk_mq_set_request_complete(struct request *rq) void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); +void blk_mq_end_request_batch(struct io_comp_batch *ib); + +/* + * Only need start/end time stamping if we have iostat or + * blk stats enabled, or using an IO scheduler. 
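
Purely as a hedged illustration of the batched completion flow the new ->poll signature enables (the mydrv_* reaping helpers are invented): completions found while polling are queued on the io_comp_batch and finished together by blk_mq_end_request_batch().

/* Hypothetical helpers, provided elsewhere in the driver. */
struct request *mydrv_reap_one_completion(struct blk_mq_hw_ctx *hctx);
blk_status_t mydrv_result(struct request *rq);

static void mydrv_complete_batch(struct io_comp_batch *iob)
{
	/* per-request driver teardown would go here */
	blk_mq_end_request_batch(iob);
}

/* Sketch of a polled completion path using the io_comp_batch. */
static int mydrv_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct request *rq;
	int found = 0;

	while ((rq = mydrv_reap_one_completion(hctx)) != NULL) {
		found++;
		if (!blk_mq_add_to_batch(rq, iob, mydrv_result(rq),
					 mydrv_complete_batch))
			blk_mq_end_request(rq, mydrv_result(rq));
	}
	return found;	/* number of completions processed */
}
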
+ */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV)); +} + +/* + * Batched completions only work when there is no I/O error and no special + * ->end_io handler. + */ +static inline bool blk_mq_add_to_batch(struct request *req, + struct io_comp_batch *iob, int ioerror, + void (*complete)(struct io_comp_batch *)) +{ + if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror) + return false; + if (!iob->complete) + iob->complete = complete; + else if (iob->complete != complete) + return false; + iob->need_ts |= blk_mq_need_time_stamp(req); + rq_list_add(&iob->req_list, req); + return true; +} void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); @@ -605,16 +865,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) -static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag != -1) - return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); - - return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | - BLK_QC_T_INTERNAL; -} - static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) @@ -633,8 +883,265 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->rq_disk = bio->bi_bdev->bd_disk; } -blk_qc_t blk_mq_submit_bio(struct bio *bio); void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); +static inline bool rq_is_sync(struct request *rq) +{ + return op_is_sync(rq->cmd_flags); +} + +void blk_rq_init(struct request_queue *q, struct request *rq); +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); +void blk_rq_unprep_clone(struct request *rq); +blk_status_t blk_insert_cloned_request(struct request_queue *q, + struct request *rq); + +struct rq_map_data { + struct page **pages; + int page_order; + int nr_entries; + unsigned long offset; + int null_mapped; + int from_user; +}; + +int blk_rq_map_user(struct request_queue *, struct request *, + struct rq_map_data *, void __user *, unsigned long, gfp_t); +int blk_rq_map_user_iov(struct request_queue *, struct request *, + struct rq_map_data *, const struct iov_iter *, gfp_t); +int blk_rq_unmap_user(struct bio *); +int blk_rq_map_kern(struct request_queue *, struct request *, void *, + unsigned int, gfp_t); +int blk_rq_append_bio(struct request *rq, struct bio *bio); +void blk_execute_rq_nowait(struct gendisk *, struct request *, int, + rq_end_io_fn *); +blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, + int at_head); + +struct req_iterator { + struct bvec_iter iter; + struct bio *bio; +}; + +#define __rq_for_each_bio(_bio, rq) \ + if ((rq->bio)) \ + for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) + +#define rq_for_each_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bio_for_each_segment(bvl, _iter.bio, _iter.iter) + +#define rq_for_each_bvec(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bio_for_each_bvec(bvl, _iter.bio, _iter.iter) + +#define rq_iter_last(bvec, _iter) \ + (_iter.bio->bi_next == NULL && \ + bio_iter_last(bvec, _iter.iter)) + +/* + * blk_rq_pos() : the current sector + * blk_rq_bytes() : bytes left in the entire request + * blk_rq_cur_bytes() : bytes left in the current 
segment + * blk_rq_err_bytes() : bytes left till the next error boundary + * blk_rq_sectors() : sectors left in the entire request + * blk_rq_cur_sectors() : sectors left in the current segment + * blk_rq_stats_sectors() : sectors of the entire request used for stats + */ +static inline sector_t blk_rq_pos(const struct request *rq) +{ + return rq->__sector; +} + +static inline unsigned int blk_rq_bytes(const struct request *rq) +{ + return rq->__data_len; +} + +static inline int blk_rq_cur_bytes(const struct request *rq) +{ + if (!rq->bio) + return 0; + if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ + return rq->bio->bi_iter.bi_size; + return bio_iovec(rq->bio).bv_len; +} + +unsigned int blk_rq_err_bytes(const struct request *rq); + +static inline unsigned int blk_rq_sectors(const struct request *rq) +{ + return blk_rq_bytes(rq) >> SECTOR_SHIFT; +} + +static inline unsigned int blk_rq_cur_sectors(const struct request *rq) +{ + return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; +} + +static inline unsigned int blk_rq_stats_sectors(const struct request *rq) +{ + return rq->stats_sectors; +} + +/* + * Some commands like WRITE SAME have a payload or data transfer size which + * is different from the size of the request. Any driver that supports such + * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to + * calculate the data transfer size. + */ +static inline unsigned int blk_rq_payload_bytes(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + return blk_rq_bytes(rq); +} + +/* + * Return the first full biovec in the request. The caller needs to check that + * there are any bvecs before calling this helper. + */ +static inline struct bio_vec req_bvec(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec; + return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); +} + +static inline unsigned int blk_rq_count_bios(struct request *rq) +{ + unsigned int nr_bios = 0; + struct bio *bio; + + __rq_for_each_bio(bio, rq) + nr_bios++; + + return nr_bios; +} + +void blk_steal_bios(struct bio_list *list, struct request *rq); + +/* + * Request completion related functions. + * + * blk_update_request() completes given number of bytes and updates + * the request without completing it. + */ +bool blk_update_request(struct request *rq, blk_status_t error, + unsigned int nr_bytes); +void blk_abort_request(struct request *); + +/* + * Number of physical segments as sent to the device. + * + * Normally this is the number of discontiguous data segments sent by the + * submitter. But for data-less command like discard we might have no + * actual data segments submitted, but the driver might have to add it's + * own special payload. In that case we still return 1 here so that this + * special payload will be mapped. + */ +static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return 1; + return rq->nr_phys_segments; +} + +/* + * Number of discard segments (or ranges) the driver needs to fill in. + * Each discard bio merged into a request is counted as one segment. 
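
For orientation, a hedged sketch (invented names, and assuming lowmem pages for simplicity) of walking a request's data with the iterators declared above:

#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/string.h>

/* Illustration: copy a request's payload into a device bounce buffer. */
static void mydrv_copy_out(struct request *rq, void *bounce)
{
	struct req_iterator iter;
	struct bio_vec bv;
	size_t off = 0;

	rq_for_each_segment(bv, rq, iter) {
		/* simplified: assumes lowmem pages, real code would map them */
		memcpy(bounce + off,
		       page_address(bv.bv_page) + bv.bv_offset, bv.bv_len);
		off += bv.bv_len;
	}
}
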
+ */ +static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) +{ + return max_t(unsigned short, rq->nr_phys_segments, 1); +} + +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist) +{ + struct scatterlist *last_sg = NULL; + + return __blk_rq_map_sg(q, rq, sglist, &last_sg); +} +void blk_dump_rq_flags(struct request *, char *); + +#ifdef CONFIG_BLK_DEV_ZONED +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + +bool blk_req_needs_zone_write_lock(struct request *rq); +bool blk_req_zone_write_trylock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else /* CONFIG_BLK_DEV_ZONED */ +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void rq_flush_dcache_pages(struct request *rq); +#else +static inline void rq_flush_dcache_pages(struct request *rq) +{ +} +#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ +#endif /* BLK_MQ_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index be622b5a21ed..fe065c394fff 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,8 +20,26 @@ struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; +/* + * The basic unit of block I/O is a sector. It is used in a number of contexts + * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 + * bytes. Variables of type sector_t represent an offset or size that is a + * multiple of 512 bytes. Hence these two constants. 
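
A worked example of the arithmetic these constants encode, e.g. for the new bd_nr_sectors field: a 1 GiB device spans (1 << 30) >> SECTOR_SHIFT = 2097152 sectors. As a tiny sketch (helper names invented):

/* Byte <-> sector conversions using the constants above. */
static inline sector_t example_bytes_to_sectors(loff_t bytes)
{
	return bytes >> SECTOR_SHIFT;		/* 4096 bytes -> 8 sectors */
}

static inline loff_t example_sectors_to_bytes(sector_t sects)
{
	return (loff_t)sects << SECTOR_SHIFT;	/* 8 sectors -> 4096 bytes */
}
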
+ */ +#ifndef SECTOR_SHIFT +#define SECTOR_SHIFT 9 +#endif +#ifndef SECTOR_SIZE +#define SECTOR_SIZE (1 << SECTOR_SHIFT) +#endif + +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + struct block_device { sector_t bd_start_sect; + sector_t bd_nr_sectors; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; bool bd_read_only; /* read-only policy */ @@ -38,6 +56,7 @@ struct block_device { u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; + struct request_queue * bd_queue; /* The counter of freeze processes */ int bd_fsfreeze_count; @@ -208,6 +227,9 @@ static inline void bio_issue_init(struct bio_issue *issue, ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } +typedef unsigned int blk_qc_t; +#define BLK_QC_T_NONE -1U + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -227,8 +249,8 @@ struct bio { struct bvec_iter bi_iter; + blk_qc_t bi_cookie; bio_end_io_t *bi_end_io; - void *bi_private; #ifdef CONFIG_BLK_CGROUP /* @@ -384,7 +406,7 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_HIPRI, + __REQ_POLLED, /* caller polls for completion using bio_poll */ /* for driver use */ __REQ_DRV, @@ -409,7 +431,7 @@ enum req_flag_bits { #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_HIPRI (1ULL << __REQ_HIPRI) +#define REQ_POLLED (1ULL << __REQ_POLLED) #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) @@ -431,8 +453,6 @@ enum stat_group { #define bio_op(bio) \ ((bio)->bi_opf & REQ_OP_MASK) -#define req_op(req) \ - ((req)->cmd_flags & REQ_OP_MASK) /* obsolete, don't use in new code */ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, @@ -497,31 +517,6 @@ static inline int op_stat_group(unsigned int op) return op_is_write(op); } -typedef unsigned int blk_qc_t; -#define BLK_QC_T_NONE -1U -#define BLK_QC_T_SHIFT 16 -#define BLK_QC_T_INTERNAL (1U << 31) - -static inline bool blk_qc_t_valid(blk_qc_t cookie) -{ - return cookie != BLK_QC_T_NONE; -} - -static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -{ - return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; -} - -static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) -{ - return cookie & ((1u << BLK_QC_T_SHIFT) - 1); -} - -static inline bool blk_qc_t_is_internal(blk_qc_t cookie) -{ - return (cookie & BLK_QC_T_INTERNAL) != 0; -} - struct blk_rq_stat { u64 mean; u64 min; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12b9dbcc980e..bd4370baccca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -3,8 +3,6 @@ #define _LINUX_BLKDEV_H #include <linux/sched.h> -#include <linux/sched/clock.h> -#include <linux/major.h> #include <linux/genhd.h> #include <linux/list.h> #include <linux/llist.h> @@ -12,17 +10,11 @@ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/wait.h> -#include <linux/mempool.h> -#include <linux/pfn.h> #include <linux/bio.h> -#include <linux/stringify.h> #include <linux/gfp.h> -#include <linux/smp.h> #include <linux/rcupdate.h> #include <linux/percpu-refcount.h> -#include <linux/scatterlist.h> #include <linux/blkzoned.h> -#include <linux/pm.h> #include <linux/sbitmap.h> struct module; @@ -33,14 +25,12 @@ struct request; struct sg_io_hdr; struct blkcg_gq; struct blk_flush_queue; +struct kiocb; struct 
pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; -struct blk_keyslot_manager; - -#define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 /* Default maximum */ +struct blk_crypto_profile; /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -54,186 +44,13 @@ struct blk_keyslot_manager; */ #define BLKCG_MAX_POLS 6 -typedef void (rq_end_io_fn)(struct request *, blk_status_t); - -/* - * request flags */ -typedef __u32 __bitwise req_flags_t; - -/* drive already may have started this one */ -#define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* may not be passed by ioscheduler */ -#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) -/* request for flush sequence */ -#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) -/* merge of different types, fail separately */ -#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) -/* track inflight for MQ */ -#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) -/* don't call prep for this one */ -#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* vaguely specified driver internal error. Ignored by the block layer */ -#define RQF_FAILED ((__force req_flags_t)(1 << 10)) -/* don't warn about errors */ -#define RQF_QUIET ((__force req_flags_t)(1 << 11)) -/* elevator private data attached */ -#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) -/* account into disk and partition IO statistics */ -#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* runtime pm request */ -#define RQF_PM ((__force req_flags_t)(1 << 15)) -/* on IO scheduler merge hash */ -#define RQF_HASHED ((__force req_flags_t)(1 << 16)) -/* track IO completion time */ -#define RQF_STATS ((__force req_flags_t)(1 << 17)) -/* Look at ->special_vec for the actual data payload instead of the - bio chain. */ -#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The per-zone write lock is held for this request */ -#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) -/* already slept for hybrid poll */ -#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) -/* ->timeout has been called, don't expire again */ -#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) - -/* flags that prevent us from merging requests: */ -#define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) - -/* - * Request state for blk-mq. - */ -enum mq_rq_state { - MQ_RQ_IDLE = 0, - MQ_RQ_IN_FLIGHT = 1, - MQ_RQ_COMPLETE = 2, -}; - -/* - * Try to put the fields that are referenced together in the same cacheline. - * - * If you modify this structure, make sure to update blk_rq_init() and - * especially blk_mq_rq_ctx_init() to take care of the added fields. - */ -struct request { - struct request_queue *q; - struct blk_mq_ctx *mq_ctx; - struct blk_mq_hw_ctx *mq_hctx; - - unsigned int cmd_flags; /* op and common flags */ - req_flags_t rq_flags; - - int tag; - int internal_tag; - - /* the following two fields are internal, NEVER access directly */ - unsigned int __data_len; /* total data len */ - sector_t __sector; /* sector cursor */ - - struct bio *bio; - struct bio *biotail; - - struct list_head queuelist; - - /* - * The hash is used inside the scheduler, and killed once the - * request reaches the dispatch list. The ipi_list is only used - * to queue the request for softirq completion, which is long - * after the request has been unhashed (and even removed from - * the dispatch list). 
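
As a hedged sketch of the intended use of the new blk_validate_block_size() helper (the mydrv_* name is invented): a driver that accepts a user-supplied logical block size can validate it before applying it to its queue.

/* Sketch: validate a user-supplied block size before applying it. */
static int mydrv_set_block_size(struct request_queue *q, unsigned int bsize)
{
	int err;

	err = blk_validate_block_size(bsize);
	if (err)
		return err;	/* not a power of two in [512, PAGE_SIZE] */

	blk_queue_logical_block_size(q, bsize);
	return 0;
}
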
- */ - union { - struct hlist_node hash; /* merge hash */ - struct llist_node ipi_list; - }; - - /* - * The rb_node is only used inside the io scheduler, requests - * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. - */ - union { - struct rb_node rb_node; /* sort/lookup */ - struct bio_vec special_vec; - void *completion_data; - int error_count; /* for legacy drivers, don't use */ - }; - - /* - * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. Flush requests are - * never put on the IO scheduler. So let the flush fields share - * space with the elevator data. - */ - union { - struct { - struct io_cq *icq; - void *priv[2]; - } elv; - - struct { - unsigned int seq; - struct list_head list; - rq_end_io_fn *saved_end_io; - } flush; - }; - - struct gendisk *rq_disk; - struct block_device *part; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - /* Time that the first bio started allocating this request. */ - u64 alloc_time_ns; -#endif - /* Time that this request was allocated for this IO. */ - u64 start_time_ns; - /* Time that I/O was submitted to the device. */ - u64 io_start_time_ns; - -#ifdef CONFIG_BLK_WBT - unsigned short wbt_flags; -#endif - /* - * rq sectors used for blk stats. It has the same value - * with blk_rq_sectors(rq), except that it never be zeroed - * by completion. - */ - unsigned short stats_sectors; - - /* - * Number of scatter-gather DMA addr+len pairs after - * physical address coalescing is performed. - */ - unsigned short nr_phys_segments; - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - unsigned short nr_integrity_segments; -#endif - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - struct bio_crypt_ctx *crypt_ctx; - struct blk_ksm_keyslot *crypt_keyslot; -#endif - - unsigned short write_hint; - unsigned short ioprio; - - enum mq_rq_state state; - refcount_t ref; - - unsigned int timeout; - unsigned long deadline; - - union { - struct __call_single_data csd; - u64 fifo_time; - }; +static inline int blk_validate_block_size(unsigned int bsize) +{ + if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + return -EINVAL; - /* - * completion callback. - */ - rq_end_io_fn *end_io; - void *end_io_data; -}; + return 0; +} static inline bool blk_op_is_passthrough(unsigned int op) { @@ -241,35 +58,6 @@ static inline bool blk_op_is_passthrough(unsigned int op) return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } -static inline bool blk_rq_is_passthrough(struct request *rq) -{ - return blk_op_is_passthrough(req_op(rq)); -} - -static inline unsigned short req_get_ioprio(struct request *req) -{ - return req->ioprio; -} - -#include <linux/elevator.h> - -struct blk_queue_ctx; - -struct bio_vec; - -enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ -}; - -enum blk_queue_state { - Queue_down, - Queue_up, -}; - -#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ - /* * Zoned block device models (zoned limit). * @@ -370,6 +158,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, #endif /* CONFIG_BLK_DEV_ZONED */ +/* + * Independent access ranges: struct blk_independent_access_range describes + * a range of contiguous sectors that can be accessed using device command + * execution resources that are independent from the resources used for + * other access ranges. 
This is typically found with single-LUN multi-actuator + * HDDs where each access range is served by a different set of heads. + * The set of independent ranges supported by the device is defined using + * struct blk_independent_access_ranges. The independent ranges must not overlap + * and must include all sectors within the disk capacity (no sector holes + * allowed). + * For a device with multiple ranges, requests targeting sectors in different + * ranges can be executed in parallel. A request can straddle an access range + * boundary. + */ +struct blk_independent_access_range { + struct kobject kobj; + struct request_queue *queue; + sector_t sector; + sector_t nr_sectors; +}; + +struct blk_independent_access_ranges { + struct kobject kobj; + bool sysfs_registered; + unsigned int nr_ia_ranges; + struct blk_independent_access_range ia_range[]; +}; + struct request_queue { struct request *last_merge; struct elevator_queue *elevator; @@ -444,8 +260,7 @@ struct request_queue { unsigned int dma_alignment; #ifdef CONFIG_BLK_INLINE_ENCRYPTION - /* Inline crypto capabilities */ - struct blk_keyslot_manager *ksm; + struct blk_crypto_profile *crypto_profile; #endif unsigned int rq_timeout; @@ -457,10 +272,9 @@ struct request_queue { struct timer_list timeout; struct work_struct timeout_work; - atomic_t nr_active_requests_shared_sbitmap; + atomic_t nr_active_requests_shared_tags; - struct sbitmap_queue sched_bitmap_tags; - struct sbitmap_queue sched_breserved_tags; + struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP @@ -536,6 +350,8 @@ struct request_queue { */ struct mutex mq_freeze_lock; + int quiesce_depth; + struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set bio_split; @@ -549,10 +365,14 @@ struct request_queue { bool mq_sysfs_init_done; - size_t cmd_size; - #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; + + /* + * Independent sector access ranges. This is always NULL for + * devices that do not have multiple independent access ranges. + */ + struct blk_independent_access_ranges *ia_ranges; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ @@ -579,7 +399,6 @@ struct request_queue { #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ @@ -613,8 +432,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -#define blk_queue_scsi_passthrough(q) \ - test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) #ifdef CONFIG_BLK_RQ_ALLOC_TIME @@ -638,11 +455,6 @@ extern void blk_clear_pm_only(struct request_queue *q); #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) -#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) - -#define rq_dma_dir(rq) \ - (op_is_write(req_op(rq)) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE) - #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) @@ -758,42 +570,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q) } #endif /* CONFIG_BLK_DEV_ZONED */ -static inline bool rq_is_sync(struct request *rq) -{ - return op_is_sync(rq->cmd_flags); -} - -static inline bool rq_mergeable(struct request *rq) -{ - if (blk_rq_is_passthrough(rq)) - return false; - - if (req_op(rq) == REQ_OP_FLUSH) - return false; - - if (req_op(rq) == REQ_OP_WRITE_ZEROES) - return false; - - if (req_op(rq) == REQ_OP_ZONE_APPEND) - return false; - - if (rq->cmd_flags & REQ_NOMERGE_FLAGS) - return false; - if (rq->rq_flags & RQF_NOMERGE_FLAGS) - return false; - - return true; -} - -static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) -{ - if (bio_page(a) == bio_page(b) && - bio_offset(a) == bio_offset(b)) - return true; - - return false; -} - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) @@ -808,83 +584,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) -struct rq_map_data { - struct page **pages; - int page_order; - int nr_entries; - unsigned long offset; - int null_mapped; - int from_user; -}; - -struct req_iterator { - struct bvec_iter iter; - struct bio *bio; -}; - /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) -#define __rq_for_each_bio(_bio, rq) \ - if ((rq->bio)) \ - for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) -#define rq_for_each_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_segment(bvl, _iter.bio, _iter.iter) - -#define rq_for_each_bvec(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_bvec(bvl, _iter.bio, _iter.iter) - -#define rq_iter_last(bvec, _iter) \ - (_iter.bio->bi_next == NULL && \ - bio_iter_last(bvec, _iter.iter)) - -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -extern void rq_flush_dcache_pages(struct request *rq); -#else -static inline void rq_flush_dcache_pages(struct request *rq) -{ -} -#endif extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -blk_qc_t submit_bio_noacct(struct bio *bio); -extern void blk_rq_init(struct request_queue *q, struct request *rq); -extern void blk_put_request(struct request *); -extern struct request *blk_get_request(struct request_queue *, unsigned int op, - blk_mq_req_flags_t flags); +void submit_bio_noacct(struct bio *bio); + extern int blk_lld_busy(struct request_queue *q); -extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data); -extern void blk_rq_unprep_clone(struct request *rq); -extern blk_status_t blk_insert_cloned_request(struct request_queue *q, - struct request *rq); -int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -extern int blk_rq_map_user(struct request_queue *, 
struct request *, - struct rq_map_data *, void __user *, unsigned long, - gfp_t); -extern int blk_rq_unmap_user(struct bio *); -extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); -extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, const struct iov_iter *, - gfp_t); -extern void blk_execute_rq_nowait(struct gendisk *, - struct request *, int, rq_end_io_fn *); - -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op); @@ -892,69 +605,17 @@ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); +/* only poll the hardware once, don't continue until a completion was found */ +#define BLK_POLL_ONESHOT (1 << 0) +/* do not sleep to wait for the expected completion time */ +#define BLK_POLL_NOSLEEP (1 << 1) +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue; /* this is never NULL */ -} - -/* - * The basic unit of block I/O is a sector. It is used in a number of contexts - * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 - * bytes. Variables of type sector_t represent an offset or size that is a - * multiple of 512 bytes. Hence these two constants. - */ -#ifndef SECTOR_SHIFT -#define SECTOR_SHIFT 9 -#endif -#ifndef SECTOR_SIZE -#define SECTOR_SIZE (1 << SECTOR_SHIFT) -#endif - -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - -/* - * blk_rq_pos() : the current sector - * blk_rq_bytes() : bytes left in the entire request - * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_err_bytes() : bytes left till the next error boundary - * blk_rq_sectors() : sectors left in the entire request - * blk_rq_cur_sectors() : sectors left in the current segment - * blk_rq_stats_sectors() : sectors of the entire request used for stats - */ -static inline sector_t blk_rq_pos(const struct request *rq) -{ - return rq->__sector; -} - -static inline unsigned int blk_rq_bytes(const struct request *rq) -{ - return rq->__data_len; -} - -static inline int blk_rq_cur_bytes(const struct request *rq) -{ - return rq->bio ? 
bio_cur_bytes(rq->bio) : 0; -} - -extern unsigned int blk_rq_err_bytes(const struct request *rq); - -static inline unsigned int blk_rq_sectors(const struct request *rq) -{ - return blk_rq_bytes(rq) >> SECTOR_SHIFT; -} - -static inline unsigned int blk_rq_cur_sectors(const struct request *rq) -{ - return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; -} - -static inline unsigned int blk_rq_stats_sectors(const struct request *rq) -{ - return rq->stats_sectors; + return bdev->bd_queue; /* this is never NULL */ } #ifdef CONFIG_BLK_DEV_ZONED @@ -973,42 +634,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio) return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev), bio->bi_iter.bi_sector); } - -static inline unsigned int blk_rq_zone_no(struct request *rq) -{ - return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); -} - -static inline unsigned int blk_rq_zone_is_seq(struct request *rq) -{ - return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); -} #endif /* CONFIG_BLK_DEV_ZONED */ -/* - * Some commands like WRITE SAME have a payload or data transfer size which - * is different from the size of the request. Any driver that supports such - * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to - * calculate the data transfer size. - */ -static inline unsigned int blk_rq_payload_bytes(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return rq->special_vec.bv_len; - return blk_rq_bytes(rq); -} - -/* - * Return the first full biovec in the request. The caller needs to check that - * there are any bvecs before calling this helper. - */ -static inline struct bio_vec req_bvec(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return rq->special_vec; - return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); -} - static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { @@ -1048,47 +675,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, return min(q->limits.max_sectors, chunk_sectors); } -static inline unsigned int blk_rq_get_max_sectors(struct request *rq, - sector_t offset) -{ - struct request_queue *q = rq->q; - - if (blk_rq_is_passthrough(rq)) - return q->limits.max_hw_sectors; - - if (!q->limits.chunk_sectors || - req_op(rq) == REQ_OP_DISCARD || - req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); - - return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); -} - -static inline unsigned int blk_rq_count_bios(struct request *rq) -{ - unsigned int nr_bios = 0; - struct bio *bio; - - __rq_for_each_bio(bio, rq) - nr_bios++; - - return nr_bios; -} - -void blk_steal_bios(struct bio_list *list, struct request *rq); - -/* - * Request completion related functions. - * - * blk_update_request() completes given number of bytes and updates - * the request without completing it. 
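The request-completion declarations removed below leave blkdev.h together with the rest of the struct request machinery (blktrace_api.h now pulls in <linux/blk-mq.h> for the same reason). The pattern the removed comment describes is unchanged; a minimal, hedged sketch of partial completion in a blk-mq driver follows, where the function name and the way @done is obtained are illustrative only:

#include <linux/blk-mq.h>

/* complete @done bytes of @rq; finish the request once nothing is left */
static void example_complete_rq(struct request *rq, unsigned int done)
{
        if (blk_update_request(rq, BLK_STS_OK, done))
                return;                         /* bytes still outstanding */
        __blk_mq_end_request(rq, BLK_STS_OK);   /* all data transferred */
}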
- */ -extern bool blk_update_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes); - -extern void blk_abort_request(struct request *); - /* * Access functions for manipulating queue properties */ @@ -1133,46 +719,24 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); -extern void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features); -extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, - struct device *dev); -/* - * Number of physical segments as sent to the device. - * - * Normally this is the number of discontiguous data segments sent by the - * submitter. But for data-less command like discard we might have no - * actual data segments submitted, but the driver might have to add it's - * own special payload. In that case we still return 1 here so that this - * special payload will be mapped. - */ -static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return 1; - return rq->nr_phys_segments; -} +struct blk_independent_access_ranges * +disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); +void disk_set_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars); /* - * Number of discard segments (or ranges) the driver needs to fill in. - * Each discard bio merged into a request is counted as one segment. + * Elevator features for blk_queue_required_elevator_features: */ -static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) -{ - return max_t(unsigned short, rq->nr_phys_segments, 1); -} - -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) -{ - struct scatterlist *last_sg = NULL; +/* Supports zoned block devices sequential write constraint */ +#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) +/* Supports scheduling on multiple hardware queues */ +#define ELEVATOR_F_MQ_AWARE (1U << 1) - return __blk_rq_map_sg(q, rq, sglist, &last_sg); -} -extern void blk_dump_rq_flags(struct request *, char *); +extern void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features); +extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, + struct device *dev); bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); @@ -1187,19 +751,24 @@ extern void blk_set_queue_dying(struct request_queue *); * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list - * or when attempting a merge, because blk_schedule_flush_list() will only flush - * the plug list when the task sleeps by itself. For details, please see - * schedule() where blk_schedule_flush_plug() is called. + * or when attempting a merge. For details, please see schedule() where + * blk_flush_plug() is called. 
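The plugging interface keeps the same submitter-side shape even though the plug now tracks requests in a singly linked list rather than a list_head. A minimal sketch, hedged: only blk_start_plug(), submit_bio_noacct() and blk_finish_plug() from this header are assumed, plus bio_list_pop() from <linux/bio.h>; the surrounding function is made up:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* submit a batch of bios under one plug so they can be merged and batched */
static void example_submit_batch(struct bio_list *bios)
{
        struct blk_plug plug;
        struct bio *bio;

        blk_start_plug(&plug);
        while ((bio = bio_list_pop(bios)) != NULL)
                submit_bio_noacct(bio);
        blk_finish_plug(&plug);         /* flush the plugged requests to the driver */
}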
*/ struct blk_plug { - struct list_head mq_list; /* blk-mq requests */ - struct list_head cb_list; /* md requires an unplug callback */ + struct request *mq_list; /* blk-mq requests */ + + /* if ios_left is > 1, we can batch tag/rq allocations */ + struct request *cached_rq; + unsigned short nr_ios; + unsigned short rq_count; + bool multiple_queues; + bool has_elevator; bool nowait; + + struct list_head cb_list; /* md requires an unplug callback */ }; -#define BLK_MAX_REQUEST_COUNT 16 -#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); @@ -1211,32 +780,17 @@ struct blk_plug_cb { extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); +extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); -extern void blk_flush_plug_list(struct blk_plug *, bool); - -static inline void blk_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - if (plug) - blk_flush_plug_list(plug, false); -} - -static inline void blk_schedule_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - if (plug) - blk_flush_plug_list(plug, true); -} +void blk_flush_plug(struct blk_plug *plug, bool from_schedule); static inline bool blk_needs_flush_plug(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; return plug && - (!list_empty(&plug->mq_list) || - !list_empty(&plug->cb_list)); + (plug->mq_list || !list_empty(&plug->cb_list)); } int blkdev_issue_flush(struct block_device *bdev); @@ -1245,23 +799,23 @@ long nr_blockdev_pages(void); struct blk_plug { }; -static inline void blk_start_plug(struct blk_plug *plug) +static inline void blk_start_plug_nr_ios(struct blk_plug *plug, + unsigned short nr_ios) { } -static inline void blk_finish_plug(struct blk_plug *plug) +static inline void blk_start_plug(struct blk_plug *plug) { } -static inline void blk_flush_plug(struct task_struct *task) +static inline void blk_finish_plug(struct blk_plug *plug) { } -static inline void blk_schedule_flush_plug(struct task_struct *task) +static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } - static inline bool blk_needs_flush_plug(struct task_struct *tsk) { return false; @@ -1499,22 +1053,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector return offset << SECTOR_SHIFT; } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. 
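Back to the independent access ranges introduced earlier in this hunk: a driver is expected to allocate the range array, fill in the hole-free, non-overlapping sector spans, and hand the result to the block layer. A hedged sketch of a probe path for a hypothetical dual-actuator disk; disk and capacity are assumed to be supplied by the caller:

/* split the capacity into two equally sized, independently served ranges */
static void example_set_access_ranges(struct gendisk *disk, sector_t capacity)
{
        struct blk_independent_access_ranges *iars;
        sector_t half = capacity / 2;

        iars = disk_alloc_independent_access_ranges(disk, 2);
        if (!iars)
                return;         /* optional feature, not fatal */

        iars->ia_range[0].sector = 0;
        iars->ia_range[0].nr_sectors = half;
        iars->ia_range[1].sector = half;
        iars->ia_range[1].nr_sectors = capacity - half;

        disk_set_independent_access_ranges(disk, iars);
}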
- */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1628,210 +1166,36 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, - BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, -}; - -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; -}; - -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; - integrity_complete_fn *complete_fn; - const char *name; -}; - -extern void blk_integrity_register(struct gendisk *, struct blk_integrity *); -extern void blk_integrity_unregister(struct gendisk *); -extern int blk_integrity_compare(struct gendisk *, struct gendisk *); -extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, - struct scatterlist *); -extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); - -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->profile) - return NULL; - - return bi; -} - -static inline -struct blk_integrity *bdev_get_integrity(struct block_device *bdev) -{ - return blk_get_integrity(bdev->bd_disk); -} - -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - -static inline bool blk_integrity_rq(struct request *rq) -{ - return rq->cmd_flags & REQ_INTEGRITY; -} - -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - -static inline unsigned short -queue_max_integrity_segments(const struct request_queue *q) -{ - return q->limits.max_integrity_segments; -} - -/** - * bio_integrity_intervals - Return number of integrity intervals for a bio - * @bi: blk_integrity profile for device - * @sectors: Size of the bio in 512-byte sectors - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the data integrity - * interval size of the storage device. Convert the block layer sectors - * to the appropriate number of integrity intervals. - */ -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return sectors >> (bi->interval_exp - 9); -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_intervals(bi, sectors) * bi->tuple_size; -} - -/* - * Return the first bvec that contains integrity data. Only drivers that are - * limited to a single integrity segment should use this helper. 
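One worked example for the interval conversion documented a little further up (these helpers leave blkdev.h along with the other integrity bits): assume a device with a 4096-byte protection interval, i.e. bi->interval_exp = 12, and an 8-byte tuple such as T10 PI. Eight 512-byte sectors then collapse into a single integrity interval:

static void example_pi_math(struct blk_integrity *bi)
{
        /* assumes interval_exp = 12 (4096-byte interval), tuple_size = 8 */
        unsigned int intervals = bio_integrity_intervals(bi, 8); /* 8 >> (12 - 9) = 1 */
        unsigned int bytes = bio_integrity_bytes(bi, 8);         /* 1 * 8 = 8 bytes  */
}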
- */ -static inline struct bio_vec *rq_integrity_vec(struct request *rq) -{ - if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) - return NULL; - return rq->bio->bi_integrity->bip_vec; -} - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -struct bio; -struct block_device; -struct gendisk; -struct blk_integrity; - -static inline int blk_integrity_rq(struct request *rq) -{ - return 0; -} -static inline int blk_rq_count_integrity_sg(struct request_queue *q, - struct bio *b) -{ - return 0; -} -static inline int blk_rq_map_integrity_sg(struct request_queue *q, - struct bio *b, - struct scatterlist *s) -{ - return 0; -} -static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) -{ - return NULL; -} -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) -{ - return NULL; -} -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return false; -} -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} -static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) -{ - return 0; -} - -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return 0; -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return 0; -} - -static inline struct bio_vec *rq_integrity_vec(struct request *rq) -{ - return NULL; -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_BLK_INLINE_ENCRYPTION -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q); -void blk_ksm_unregister(struct request_queue *q); +void blk_crypto_unregister(struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, - struct request_queue *q) +static inline bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) { return true; } -static inline void blk_ksm_unregister(struct request_queue *q) { } +static inline void blk_crypto_unregister(struct request_queue *q) { } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ +enum blk_unique_id { + /* these match the Designator Types specified in SPC */ + BLK_UID_T10 = 1, + BLK_UID_EUI64 = 2, + BLK_UID_NAA = 3, +}; + +#define NFL4_UFLG_MASK 0x0000003F struct block_device_operations { - blk_qc_t (*submit_bio) (struct bio *bio); + void (*submit_bio)(struct bio *bio); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); @@ -1847,6 +1211,9 @@ struct block_device_operations { int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); char *(*devnode)(struct gendisk *disk, umode_t *mode); + /* returns the length of the identifier or a negative errno: */ + int (*get_unique_id)(struct gendisk *disk, u8 id[16], + enum blk_unique_id id_type); struct module *owner; const struct pr_ops *pr_ops; @@ -1869,60 +1236,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct 
block_device *, sector_t, struct page *, struct writeback_control *); -#ifdef CONFIG_BLK_DEV_ZONED -bool blk_req_needs_zone_write_lock(struct request *rq); -bool blk_req_zone_write_trylock(struct request *rq); -void __blk_req_zone_write_lock(struct request *rq); -void __blk_req_zone_write_unlock(struct request *rq); - -static inline void blk_req_zone_write_lock(struct request *rq) -{ - if (blk_req_needs_zone_write_lock(rq)) - __blk_req_zone_write_lock(rq); -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) - __blk_req_zone_write_unlock(rq); -} - -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return rq->q->seq_zones_wlock && - test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - if (!blk_req_needs_zone_write_lock(rq)) - return true; - return !blk_req_zone_is_write_locked(rq); -} -#else -static inline bool blk_req_needs_zone_write_lock(struct request *rq) -{ - return false; -} - -static inline void blk_req_zone_write_lock(struct request *rq) -{ -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ -} -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return false; -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - return true; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline void blk_wake_io_task(struct task_struct *waiter) { /* @@ -1991,6 +1304,8 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_nowait(struct block_device *bdev); +void sync_bdevs(bool wait); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1999,10 +1314,54 @@ static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline int sync_blockdev_nowait(struct block_device *bdev) +{ + return 0; +} +static inline void sync_bdevs(bool wait) +{ +} #endif int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); +struct io_comp_batch { + struct request *req_list; + bool need_ts; + void (*complete)(struct io_comp_batch *); +}; + +#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } + +#define rq_list_add(listptr, rq) do { \ + (rq)->rq_next = *(listptr); \ + *(listptr) = rq; \ +} while (0) + +#define rq_list_pop(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) { \ + __req = *(listptr); \ + *(listptr) = __req->rq_next; \ + } \ + __req; \ +}) + +#define rq_list_peek(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) \ + __req = *(listptr); \ + __req; \ +}) + +#define rq_list_for_each(listptr, pos) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \ + +#define rq_list_next(rq) (rq)->rq_next +#define rq_list_empty(list) ((list) == (struct request *) NULL) + #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index a083e15df608..22501a293fa5 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -2,7 +2,7 @@ #ifndef BLKTRACE_H #define BLKTRACE_H -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/relay.h> #include <linux/compat.h> #include <uapi/linux/blktrace_api.h> diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 
020a7d5bf470..3db6f6c95489 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -929,8 +929,11 @@ struct bpf_array_aux { * stored in the map to make sure that all callers and callees have * the same prog type and JITed flag. */ - enum bpf_prog_type type; - bool jited; + struct { + spinlock_t lock; + enum bpf_prog_type type; + bool jited; + } owner; /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9c81724e4b98..bbe1eefa4c8a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -101,14 +101,14 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) -#ifdef CONFIG_NET -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) +#ifdef CONFIG_NET +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0e9bdd42dafb..35c25dff651a 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -44,7 +44,7 @@ struct bvec_iter { unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ -}; +} __packed; struct bvec_iter_all { struct bio_vec bv; diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index c4fef00abdf3..0a89f111e00e 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -64,6 +64,7 @@ struct cdrom_device_info { int for_data; int (*exit)(struct cdrom_device_info *); int mrw_mode_page; + __s64 last_media_change_ms; }; struct cdrom_device_ops { diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 4d7fced3a39f..7a14807c9d1a 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -105,7 +105,7 @@ static inline void user_exit_irqoff(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } -static inline bool context_tracking_guest_enter(void) { return false; } +static __always_inline bool context_tracking_guest_enter(void) { return false; } static inline void context_tracking_guest_exit(void) { } #endif /* !CONFIG_CONTEXT_TRACKING */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 832d8a74fa59..991911048857 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -72,6 +72,8 @@ enum cpuhp_state { CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, + /* Must be after CPUHP_MM_VMSTAT_DEAD */ + CPUHP_MM_DEMOTION_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, @@ -240,6 +242,8 @@ enum cpuhp_state { CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + /* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */ + CPUHP_AP_MM_DEMOTION_ONLINE, 
CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_DTPM_CPU_ONLINE, diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 3f49e65169c6..dbb409d77d4f 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -47,8 +47,6 @@ extern int debug_locks_off(void); # define locking_selftest() do { } while (0) #endif -struct task_struct; - #ifdef CONFIG_LOCKDEP extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 114553b487ef..a7df155ea49b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -576,9 +576,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *t); /* - * Table keyslot manager functions + * Table blk_crypto_profile functions */ -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm); +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile); /*----------------------------------------------------------------- * Macros. diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index e1ca2080a1ff..39fefb86780b 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -173,7 +173,7 @@ static inline int dma_resv_lock_slow_interruptible(struct dma_resv *obj, */ static inline bool __must_check dma_resv_trylock(struct dma_resv *obj) { - return ww_mutex_trylock(&obj->lock); + return ww_mutex_trylock(&obj->lock, NULL); } /** diff --git a/include/linux/dsa/mv88e6xxx.h b/include/linux/dsa/mv88e6xxx.h new file mode 100644 index 000000000000..8c3d45eca46b --- /dev/null +++ b/include/linux/dsa/mv88e6xxx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright 2021 NXP + */ + +#ifndef _NET_DSA_TAG_MV88E6XXX_H +#define _NET_DSA_TAG_MV88E6XXX_H + +#include <linux/if_vlan.h> + +#define MV88E6XXX_VID_STANDALONE 0 +#define MV88E6XXX_VID_BRIDGED (VLAN_N_VID - 1) + +#endif diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 435777a0073c..8ae999f587c4 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -5,7 +5,28 @@ #ifndef _NET_DSA_TAG_OCELOT_H #define _NET_DSA_TAG_OCELOT_H +#include <linux/kthread.h> #include <linux/packing.h> +#include <linux/skbuff.h> + +struct ocelot_skb_cb { + struct sk_buff *clone; + unsigned int ptp_class; /* valid only for clones */ + u8 ptp_cmd; + u8 ts_id; +}; + +#define OCELOT_SKB_CB(skb) \ + ((struct ocelot_skb_cb *)((skb)->cb)) + +#define IFH_TAG_TYPE_C 0 +#define IFH_TAG_TYPE_S 1 + +#define IFH_REW_OP_NOOP 0x0 +#define IFH_REW_OP_DSCP 0x1 +#define IFH_REW_OP_ONE_STEP_PTP 0x2 +#define IFH_REW_OP_TWO_STEP_PTP 0x3 +#define IFH_REW_OP_ORIGIN_PTP 0x5 #define OCELOT_TAG_LEN 16 #define OCELOT_SHORT_PREFIX_LEN 4 @@ -140,6 +161,17 @@ * +------+------+------+------+------+------+------+------+ */ +struct felix_deferred_xmit_work { + struct dsa_port *dp; + struct sk_buff *skb; + struct kthread_work work; +}; + +struct felix_port { + void (*xmit_work_fn)(struct kthread_work *work); + struct kthread_worker *xmit_worker; +}; + static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) { packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); @@ -215,4 +247,21 @@ static inline void ocelot_ifh_set_vid(void *injection, u64 vid) packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0); } +/* Determine the PTP REW_OP to use for injecting the given skb */ +static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) +{ + struct sk_buff *clone = 
OCELOT_SKB_CB(skb)->clone; + u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd; + u32 rew_op = 0; + + if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) { + rew_op = ptp_cmd; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; + } else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { + rew_op = ptp_cmd; + } + + return rew_op; +} + #endif diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 171106202fe5..9e07079528a5 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -48,6 +48,10 @@ struct sja1105_tagger_data { spinlock_t meta_lock; unsigned long state; u8 ts_id; + /* Used on SJA1110 where meta frames are generated only for + * 2-step TX timestamps + */ + struct sk_buff_head skb_txtstamp_queue; }; struct sja1105_skb_cb { @@ -69,42 +73,24 @@ struct sja1105_port { bool hwts_tx_en; }; -enum sja1110_meta_tstamp { - SJA1110_META_TSTAMP_TX = 0, - SJA1110_META_TSTAMP_RX = 1, -}; - -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) - -void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, - enum sja1110_meta_tstamp dir, u64 tstamp); - -#else +/* Timestamps are in units of 8 ns clock ticks (equivalent to + * a fixed 125 MHz clock). + */ +#define SJA1105_TICK_NS 8 -static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, - u8 ts_id, enum sja1110_meta_tstamp dir, - u64 tstamp) +static inline s64 ns_to_sja1105_ticks(s64 ns) { + return ns / SJA1105_TICK_NS; } -#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ - -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105) - -extern const struct dsa_switch_ops sja1105_switch_ops; - -static inline bool dsa_port_is_sja1105(struct dsa_port *dp) +static inline s64 sja1105_ticks_to_ns(s64 ticks) { - return dp->ds->ops == &sja1105_switch_ops; + return ticks * SJA1105_TICK_NS; } -#else - static inline bool dsa_port_is_sja1105(struct dsa_port *dp) { - return false; + return true; } -#endif - #endif /* _NET_DSA_SJA1105_H */ diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 2aaa15779d50..957ebec35aad 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -109,7 +109,7 @@ static inline int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_reg #endif } -#if defined(CONFIG_UM) || defined(CONFIG_IA64) +#if (defined(CONFIG_UML) && defined(CONFIG_X86_32)) || defined(CONFIG_IA64) /* * These functions parameterize elf_core_dump in fs/binfmt_elf.c to write out * extra segments containing the gate DSO contents. 
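Returning to the SJA1105 tick helpers above: with the fixed 125 MHz PTP clock the conversion is a plain divide or multiply by the 8 ns tick, for example:

/* 1 ms expressed on the fixed 125 MHz (8 ns per tick) PTP clock */
s64 ticks = ns_to_sja1105_ticks(1000000);       /* 1000000 / 8 = 125000 ticks */
s64 back  = sja1105_ticks_to_ns(ticks);         /* 125000 * 8 = 1000000 ns    */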
Dumping its diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 928c411bd509..c58d50451485 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -308,7 +308,7 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { - ether_addr_copy(dev->dev_addr, addr); + __dev_addr_set(dev, addr, ETH_ALEN); } /** diff --git a/include/linux/filter.h b/include/linux/filter.h index 4a93c12543ee..ef03ff34234d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1051,6 +1051,7 @@ extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; +extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index c12df59d3f5f..3e378b1fb0bc 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -83,9 +83,10 @@ struct fprop_local_percpu { int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); -void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); -void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, - int max_frac); +void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, + long nr); +void __fprop_add_percpu_max(struct fprop_global *p, + struct fprop_local_percpu *pl, int max_frac, long nr); void fprop_fraction_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, unsigned long *numerator, unsigned long *denominator); @@ -96,7 +97,7 @@ void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) unsigned long flags; local_irq_save(flags); - __fprop_inc_percpu(p, pl); + __fprop_add_percpu(p, pl, 1); local_irq_restore(flags); } diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..f3cfca5edc9a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -48,6 +48,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; +struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; @@ -329,16 +330,12 @@ struct kiocb { randomized_struct_fields_start loff_t ki_pos; - void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ - union { - unsigned int ki_cookie; /* for ->iopoll */ - struct wait_page_queue *ki_waitq; /* for async buffered IO */ - }; - + struct wait_page_queue *ki_waitq; /* for async buffered IO */ randomized_struct_fields_end }; @@ -2075,7 +2072,8 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, bool spin); + int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, + unsigned int flags); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); @@ -2498,6 +2496,8 @@ enum file_time_flags { extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); +int inode_update_time(struct inode *inode, struct timespec64 *time, 
int flags); + static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e912ed9141d9..91ea9477e9bd 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -118,9 +118,6 @@ struct fscrypt_operations { */ bool (*empty_dir)(struct inode *inode); - /* The filesystem's maximum ciphertext filename length, in bytes */ - unsigned int max_namelen; - /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c68d83c87f83..59eabbc3a36b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,12 +12,10 @@ #include <linux/types.h> #include <linux/kdev_t.h> -#include <linux/rcupdate.h> -#include <linux/slab.h> -#include <linux/percpu-refcount.h> #include <linux/uuid.h> #include <linux/blk_types.h> -#include <asm/local.h> +#include <linux/device.h> +#include <linux/xarray.h> extern const struct device_type disk_type; extern struct device_type part_type; @@ -26,14 +24,6 @@ extern struct class block_class; #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 -#include <linux/major.h> -#include <linux/device.h> -#include <linux/smp.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/workqueue.h> -#include <linux/xarray.h> - #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL. @@ -149,6 +139,7 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 +#define GD_DEAD 2 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ @@ -222,6 +213,8 @@ static inline int add_disk(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); +void invalidate_disk(struct gendisk *disk); + void set_disk_ro(struct gendisk *disk, bool read_only); static inline int get_disk_ro(struct gendisk *disk) @@ -230,6 +223,11 @@ static inline int get_disk_ro(struct gendisk *disk) test_bit(GD_READ_ONLY, &disk->state); } +static inline int bdev_read_only(struct block_device *bdev) +{ + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); +} + extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); @@ -247,7 +245,12 @@ static inline sector_t get_start_sect(struct block_device *bdev) static inline sector_t bdev_nr_sectors(struct block_device *bdev) { - return i_size_read(bdev->bd_inode) >> 9; + return bdev->bd_nr_sectors; +} + +static inline loff_t bdev_nr_bytes(struct block_device *bdev) +{ + return bdev_nr_sectors(bdev) << SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) @@ -255,6 +258,12 @@ static inline sector_t get_capacity(struct gendisk *disk) return bdev_nr_sectors(disk->part0); } +static inline u64 sb_bdev_nr_blocks(struct super_block *sb) +{ + return bdev_nr_sectors(sb->s_bdev) >> + (sb->s_blocksize_bits - SECTOR_SHIFT); +} + int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); @@ -290,10 +299,6 @@ bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); void set_capacity(struct gendisk *disk, sector_t size); -/* for drivers/char/raw.c: */ -int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); -long 
compat_blkdev_ioctl(struct file *, unsigned, unsigned long); - #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..3745efd21cf6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -520,15 +520,11 @@ static inline void arch_free_page(struct page *page, int order) { } #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; -} -#endif struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, @@ -570,6 +566,15 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) return __alloc_pages(gfp_mask, order, nid, NULL); } +static inline +struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid)); + + return __folio_alloc(gfp, order, nid, NULL); +} + /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and @@ -586,6 +591,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); +struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); @@ -596,6 +602,10 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) +{ + return __folio_alloc_node(gfp, order, numa_node_id()); +} #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 4aa1031d3e4c..0a0b2b09b1b8 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -73,6 +73,12 @@ static inline void *kmap_local_page(struct page *page) return __kmap_local_page_prot(page, kmap_prot); } +static inline void *kmap_local_folio(struct folio *folio, size_t offset) +{ + struct page *page = folio_page(folio, offset / PAGE_SIZE); + return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; +} + static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); @@ -171,6 +177,11 @@ static inline void *kmap_local_page(struct page *page) return page_address(page); } +static inline void *kmap_local_folio(struct folio *folio, size_t offset) +{ + return page_address(&folio->page) + offset; +} + static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379..27cdd715c5f9 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -97,6 +97,43 @@ 
static inline void kmap_flush_unused(void); static inline void *kmap_local_page(struct page *page); /** + * kmap_local_folio - Map a page in this folio for temporary usage + * @folio: The folio containing the page. + * @offset: The byte offset within the folio which identifies the page. + * + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation:: + * + * addr1 = kmap_local_folio(folio1, offset1); + * addr2 = kmap_local_folio(folio2, offset2); + * ... + * kunmap_local(addr2); + * kunmap_local(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * + * Contrary to kmap() mappings the mapping is only valid in the context of + * the caller and cannot be handed to other contexts. + * + * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * + * While it is significantly faster than kmap() for the higmem case it + * comes with restrictions about the pointer validity. Only use when really + * necessary. + * + * On HIGHMEM enabled systems mapping a highmem page has the side effect of + * disabling migration in order to keep the virtual address stable across + * preemption. No caller of kmap_local_folio() can rely on this side effect. + * + * Context: Can be invoked from any context. + * Return: The virtual address of @offset. + */ +static inline void *kmap_local_folio(struct folio *folio, size_t offset); + +/** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f123e15d966e..f280f33ff223 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -251,15 +251,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, } /** - * thp_head - Head page of a transparent huge page. - * @page: Any page (tail, head or regular) found in the page cache. - */ -static inline struct page *thp_head(struct page *page) -{ - return compound_head(page); -} - -/** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. 
*/ @@ -336,12 +327,6 @@ static inline struct list_head *page_deferred_list(struct page *page) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) -static inline struct page *thp_head(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return page; -} - static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 24f8489583ca..63f4ea4dac9b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,8 +313,8 @@ int iomap_writepages(struct address_space *mapping, struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); - blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio, - loff_t file_offset); + void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset); }; /* @@ -337,7 +337,6 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags); ssize_t iomap_dio_complete(struct iomap_dio *dio); -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); #ifdef CONFIG_SWAP struct file; diff --git a/include/linux/irq.h b/include/linux/irq.h index c8293c817646..848e1e12c5c6 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -524,9 +524,10 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); +#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); - +#endif void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); @@ -606,8 +607,10 @@ struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); +#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); extern void irq_cpu_offline(void); +#endif extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); @@ -1261,6 +1264,7 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); * top-level IRQ handler. */ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; +asmlinkage void generic_handle_arch_irq(struct pt_regs *regs); #else #ifndef set_handle_irq #define set_handle_irq(handle_irq) \ diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 67351aac65ef..3a091d0710ae 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -14,8 +14,15 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/of.h> +#include <linux/of_irq.h> #include <linux/platform_device.h> +/* Undefined on purpose */ +extern of_irq_init_cb_t typecheck_irq_init_cb; + +#define typecheck_irq_init_cb(fn) \ + (__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn) + /* * This macro must be used by the different irqchip drivers to declare * the association between their DT compatible string and their @@ -23,24 +30,27 @@ * * @name: name that must be unique across all IRQCHIP_DECLARE of the * same file. 
- * @compstr: compatible string of the irqchip driver + * @compat: compatible string of the irqchip driver * @fn: initialization function */ -#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn) +#define IRQCHIP_DECLARE(name, compat, fn) \ + OF_DECLARE_2(irqchip, name, compat, typecheck_irq_init_cb(fn)) extern int platform_irqchip_probe(struct platform_device *pdev); #define IRQCHIP_PLATFORM_DRIVER_BEGIN(drv_name) \ static const struct of_device_id drv_name##_irqchip_match_table[] = { -#define IRQCHIP_MATCH(compat, fn) { .compatible = compat, .data = fn }, +#define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ + .data = typecheck_irq_init_cb(fn), }, #define IRQCHIP_PLATFORM_DRIVER_END(drv_name) \ {}, \ }; \ MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table); \ -static struct platform_driver drv_name##_driver = { \ - .probe = platform_irqchip_probe, \ +static struct platform_driver drv_name##_driver = { \ + .probe = IS_ENABLED(CONFIG_IRQCHIP) ? \ + platform_irqchip_probe : NULL, \ .driver = { \ .name = #drv_name, \ .owner = THIS_MODULE, \ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 59aea39785bf..93d270ca0c56 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -168,14 +168,7 @@ int generic_handle_irq(unsigned int irq); * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); - -#ifdef CONFIG_HANDLE_DOMAIN_IRQ -int handle_domain_irq(struct irq_domain *domain, - unsigned int hwirq, struct pt_regs *regs); - -int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, - struct pt_regs *regs); -#endif +int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif /* Test to see if a driver has successfully requested an irq */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2776423a587e..e8696e4a45aa 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -111,8 +111,8 @@ static __always_inline void might_resched(void) #endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -extern void ___might_sleep(const char *file, int line, int preempt_offset); -extern void __might_sleep(const char *file, int line, int preempt_offset); +extern void __might_resched(const char *file, int line, unsigned int offsets); +extern void __might_sleep(const char *file, int line); extern void __cant_sleep(const char *file, int line, int preempt_offset); extern void __cant_migrate(const char *file, int line); @@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line); * supposed to. 
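The might_sleep() annotation described above is used the same way after the __might_sleep()/__might_resched() split; a small hedged sketch of the pattern (the helper name is hypothetical, while might_sleep(), kmalloc() and GFP_KERNEL are the real interfaces):

#include <linux/kernel.h>
#include <linux/slab.h>

/* may block: document it up front so atomic-context callers are caught early */
static void *example_alloc(size_t len)
{
        might_sleep();
        return kmalloc(len, GFP_KERNEL);
}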
*/ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) /** * cant_sleep - annotation for functions that cannot sleep * @@ -168,10 +168,9 @@ extern void __cant_migrate(const char *file, int line); */ # define non_block_end() WARN_ON(current->non_block_count-- == 0) #else - static inline void ___might_sleep(const char *file, int line, - int preempt_offset) { } - static inline void __might_sleep(const char *file, int line, - int preempt_offset) { } + static inline void __might_resched(const char *file, int line, + unsigned int offsets) { } +static inline void __might_sleep(const char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) # define cant_migrate() do { } while (0) diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h deleted file mode 100644 index a27605e2f826..000000000000 --- a/include/linux/keyslot-manager.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright 2019 Google LLC - */ - -#ifndef __LINUX_KEYSLOT_MANAGER_H -#define __LINUX_KEYSLOT_MANAGER_H - -#include <linux/bio.h> -#include <linux/blk-crypto.h> - -struct blk_keyslot_manager; - -/** - * struct blk_ksm_ll_ops - functions to manage keyslots in hardware - * @keyslot_program: Program the specified key into the specified slot in the - * inline encryption hardware. - * @keyslot_evict: Evict key from the specified keyslot in the hardware. - * The key is provided so that e.g. dm layers can evict - * keys from the devices that they map over. - * Returns 0 on success, -errno otherwise. - * - * This structure should be provided by storage device drivers when they set up - * a keyslot manager - this structure holds the function ptrs that the keyslot - * manager will use to manipulate keyslots in the hardware. - */ -struct blk_ksm_ll_ops { - int (*keyslot_program)(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot); - int (*keyslot_evict)(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot); -}; - -struct blk_keyslot_manager { - /* - * The struct blk_ksm_ll_ops that this keyslot manager will use - * to perform operations like programming and evicting keys on the - * device - */ - struct blk_ksm_ll_ops ksm_ll_ops; - - /* - * The maximum number of bytes supported for specifying the data unit - * number. - */ - unsigned int max_dun_bytes_supported; - - /* - * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents - * whether a crypto mode and data unit size are supported. The i'th - * bit of crypto_mode_supported[crypto_mode] is set iff a data unit - * size of (1 << i) is supported. We only support data unit sizes - * that are powers of 2. - */ - unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; - - /* Device for runtime power management (NULL if none) */ - struct device *dev; - - /* Here onwards are *private* fields for internal keyslot manager use */ - - unsigned int num_slots; - - /* Protects programming and evicting keys from the device */ - struct rw_semaphore lock; - - /* List of idle slots, with least recently used slot at front */ - wait_queue_head_t idle_slots_wait_queue; - struct list_head idle_slots; - spinlock_t idle_slots_lock; - - /* - * Hash table which maps struct *blk_crypto_key to keyslots, so that we - * can find a key's keyslot in O(1) time rather than O(num_slots). 
- * Protected by 'lock'. - */ - struct hlist_head *slot_hashtable; - unsigned int log_slot_ht_size; - - /* Per-keyslot data */ - struct blk_ksm_keyslot *slots; -}; - -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); - -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots); - -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr); - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); - -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); - -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg); - -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key); - -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm); - -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child); - -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); - -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset); - -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm); - -#endif /* __LINUX_KEYSLOT_MANAGER_H */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 161e8164abcf..a38a5bca1ba5 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -52,7 +52,7 @@ struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); -void ksm_migrate_page(struct page *newpage, struct page *oldpage); +void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -83,7 +83,7 @@ static inline void rmap_walk_ksm(struct page *page, { } -static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) +static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ diff --git a/include/linux/libata.h b/include/linux/libata.h index c0c64f03e107..236ec689056a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -676,6 +676,18 @@ struct ata_ering { struct ata_ering_entry ring[ATA_ERING_SIZE]; }; +struct ata_cpr { + u8 num; + u8 num_storage_elements; + u64 start_lba; + u64 num_lbas; +}; + +struct ata_cpr_log { + u8 nr_cpr; + struct ata_cpr cpr[]; +}; + struct ata_device { struct ata_link *link; unsigned int devno; /* 0 or 1 */ @@ -735,6 +747,9 @@ struct ata_device { u32 zac_zones_optimal_nonseq; u32 zac_zones_max_open; + /* Concurrent positioning ranges */ + struct ata_cpr_log *cpr_log; + /* error history */ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 9fe165beb0f9..467b94257105 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -481,23 +481,6 @@ do { \ #endif /* CONFIG_LOCK_STAT */ -#ifdef CONFIG_LOCKDEP - -/* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_*_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ - LOCK_CONTENDED((_lock), (try), (lock)) - -#else /* CONFIG_LOCKDEP */ - -#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ - 
lockfl((_lock), (flags)) - -#endif /* CONFIG_LOCKDEP */ - #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 3e726ace5c62..d22430840b53 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -21,7 +21,7 @@ enum lockdep_wait_type { LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ #ifdef CONFIG_PROVE_RAW_LOCK_NESTING - LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ + LD_WAIT_CONFIG, /* preemptible in PREEMPT_RT, spinlock_t etc.. */ #else LD_WAIT_CONFIG = LD_WAIT_SPIN, #endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..e34bf0cbdf55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -369,7 +369,7 @@ enum page_memcg_data_flags { #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) -static inline bool PageMemcgKmem(struct page *page); +static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at @@ -384,89 +384,95 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __page_memcg - get the memory cgroup associated with a non-kmem page - * @page: a pointer to the page struct + * __folio_memcg - Get the memory cgroup associated with a non-kmem folio + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages or - * kmem pages. + * against some type of folios, e.g. slab folios or ex-slab folios or + * kmem folios. */ -static inline struct mem_cgroup *__page_memcg(struct page *page) +static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { - unsigned long memcg_data = page->memcg_data; + unsigned long memcg_data = folio->memcg_data; - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* - * __page_objcg - get the object cgroup associated with a kmem page - * @page: a pointer to the page struct + * __folio_objcg - get the object cgroup associated with a kmem folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the object cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the object cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages or - * LRU pages. + * against some type of folios, e.g. slab folios or ex-slab folios or + * LRU folios. 
*/ -static inline struct obj_cgroup *__page_objcg(struct page *page) +static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { - unsigned long memcg_data = page->memcg_data; + unsigned long memcg_data = folio->memcg_data; - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* - * page_memcg - get the memory cgroup associated with a page - * @page: a pointer to the page struct + * folio_memcg - Get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages. + * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem page any of the following ensures page and memcg binding + * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * - * - the page lock + * - the folio lock * - LRU isolation * - lock_page_memcg() * - exclusive reference * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. */ +static inline struct mem_cgroup *folio_memcg(struct folio *folio) +{ + if (folio_memcg_kmem(folio)) + return obj_cgroup_memcg(__folio_objcg(folio)); + return __folio_memcg(folio); +} + static inline struct mem_cgroup *page_memcg(struct page *page) { - if (PageMemcgKmem(page)) - return obj_cgroup_memcg(__page_objcg(page)); - else - return __page_memcg(page); + return folio_memcg(page_folio(page)); } -/* - * page_memcg_rcu - locklessly get the memory cgroup associated with a page - * @page: a pointer to the page struct +/** + * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages. + * against some type of folios, e.g. slab folios or ex-slab folios. + * + * Return: A pointer to the memory cgroup associated with the folio, + * or NULL. 
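Condensing the accessors above: folio->memcg_data carries an owner pointer plus low tag bits, so the owner is recovered by masking off MEMCG_DATA_FLAGS_MASK and, when MEMCG_DATA_KMEM is set, indirecting through the obj_cgroup as folio_memcg() does. A sketch of that decoding (demo_memcg_from_data() is illustrative, not a kernel helper):

static inline struct mem_cgroup *demo_memcg_from_data(unsigned long memcg_data)
{
        void *ptr = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);

        /* kmem folio: the field points at an obj_cgroup, not a memcg */
        if (memcg_data & MEMCG_DATA_KMEM)
                return obj_cgroup_memcg((struct obj_cgroup *)ptr);

        /* otherwise the masked value is the memcg itself */
        return (struct mem_cgroup *)ptr;
}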
*/ -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { - unsigned long memcg_data = READ_ONCE(page->memcg_data); + unsigned long memcg_data = READ_ONCE(folio->memcg_data); - VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { @@ -523,17 +529,18 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) #ifdef CONFIG_MEMCG_KMEM /* - * PageMemcgKmem - check if the page has MemcgKmem flag set - * @page: a pointer to the page struct + * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. + * @folio: Pointer to the folio. * - * Checks if the page has MemcgKmem flag set. The caller must ensure that - * the page has an associated memory cgroup. It's not safe to call this function - * against some types of pages, e.g. slab pages. + * Checks if the folio has MemcgKmem flag set. The caller must ensure + * that the folio has an associated memory cgroup. It's not safe to call + * this function against some types of folios, e.g. slab folios. */ -static inline bool PageMemcgKmem(struct page *page) +static inline bool folio_memcg_kmem(struct folio *folio) { - VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); - return page->memcg_data & MEMCG_DATA_KMEM; + VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); + VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); + return folio->memcg_data & MEMCG_DATA_KMEM; } /* @@ -577,7 +584,7 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #else -static inline bool PageMemcgKmem(struct page *page) +static inline bool folio_memcg_kmem(struct folio *folio) { return false; } @@ -593,6 +600,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif +static inline bool PageMemcgKmem(struct page *page) +{ + return folio_memcg_kmem(page_folio(page)); +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -684,26 +696,47 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); + +/** + * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. + * @folio: Folio to charge. + * @mm: mm context of the allocating task. + * @gfp: Reclaim mode. + * + * Try to charge @folio to the memcg that @mm belongs to, reclaiming + * pages according to @gfp if necessary. If @mm is NULL, try to + * charge to the active memcg. + * + * Do not use this for folios allocated for swapin. + * + * Return: 0 on success. Otherwise, an error code is returned. 
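A minimal sketch of how a caller is expected to pair the folio charge API documented above with its uncharge counterpart (shown just below); demo_add_folio(), demo_remove_folio() and the elided insertion steps are illustrative, not part of this patch:

static int demo_add_folio(struct folio *folio, struct mm_struct *mm)
{
        int err = mem_cgroup_charge(folio, mm, GFP_KERNEL);

        if (err)
                return err;     /* memcg limit hit and reclaim did not help */

        /* ... make the folio reachable (page cache, page tables, ...) ... */
        return 0;
}

static void demo_remove_folio(struct folio *folio)
{
        /* ... make the folio unreachable first ... */
        mem_cgroup_uncharge(folio);
}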
+ */ +static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, + gfp_t gfp) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_charge(page, mm, gfp_mask); + return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void __mem_cgroup_uncharge(struct page *page); -static inline void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct folio *folio); + +/** + * mem_cgroup_uncharge - Uncharge a folio. + * @folio: Folio to uncharge. + * + * Uncharge a folio previously charged with mem_cgroup_charge(). + */ +static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge(page); + __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_list(struct list_head *page_list); @@ -714,7 +747,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); +void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node @@ -753,33 +786,33 @@ out: } /** - * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page - * @page: the page + * folio_lruvec - return lruvec for isolating/putting an LRU folio + * @folio: Pointer to the folio. * - * This function relies on page->mem_cgroup being stable. + * This function relies on folio->mem_cgroup being stable. */ -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec(struct folio *folio) { - pg_data_t *pgdat = page_pgdat(page); - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); - VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); - return mem_cgroup_lruvec(memcg, pgdat); + VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); + return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); -struct lruvec *lock_page_lruvec(struct page *page); -struct lruvec *lock_page_lruvec_irq(struct page *page); -struct lruvec *lock_page_lruvec_irqsave(struct page *page, +struct lruvec *folio_lruvec_lock(struct folio *folio); +struct lruvec *folio_lruvec_lock_irq(struct folio *folio); +struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif @@ -947,6 +980,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); extern bool cgroup_memory_noswap; #endif +void folio_memcg_lock(struct folio *folio); +void folio_memcg_unlock(struct folio *folio); void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); @@ -1115,12 +1150,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 +static inline struct mem_cgroup *folio_memcg(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup 
*page_memcg(struct page *page) { return NULL; } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); return NULL; @@ -1131,6 +1171,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline bool folio_memcg_kmem(struct folio *folio) +{ + return false; +} + static inline bool PageMemcgKmem(struct page *page) { return false; @@ -1179,8 +1224,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) return false; } -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +static inline int mem_cgroup_charge(struct folio *folio, + struct mm_struct *mm, gfp_t gfp) { return 0; } @@ -1195,7 +1240,7 @@ static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) { } -static inline void mem_cgroup_uncharge(struct page *page) +static inline void mem_cgroup_uncharge(struct folio *folio) { } @@ -1203,7 +1248,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_migrate(struct page *old, struct page *new) +static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } @@ -1213,14 +1258,14 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, return &pgdat->__lruvec; } -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec(struct folio *folio) { - pg_data_t *pgdat = page_pgdat(page); - + struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } @@ -1250,26 +1295,26 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } -static inline struct lruvec *lock_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } -static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } -static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, +static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; @@ -1356,6 +1401,14 @@ static inline void unlock_page_memcg(struct page *page) { } +static inline void folio_memcg_lock(struct folio *folio) +{ +} + +static inline void folio_memcg_unlock(struct folio *folio) +{ +} + static inline void mem_cgroup_handle_over_high(void) { } @@ -1517,38 +1570,39 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, } /* Test requires a stable page->memcg binding, see page_memcg() */ -static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec) +static inline bool folio_matches_lruvec(struct folio *folio, + struct lruvec *lruvec) { - return lruvec_pgdat(lruvec) == page_pgdat(page) && - 
lruvec_memcg(lruvec) == page_memcg(page); + return lruvec_pgdat(lruvec) == folio_pgdat(folio) && + lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *relock_page_lruvec_irq(struct page *page, +static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { - if (page_matches_lruvec(page, locked_lruvec)) + if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } - return lock_page_lruvec_irq(page); + return folio_lruvec_lock_irq(folio); } /* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, +static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec *locked_lruvec, unsigned long *flags) { if (locked_lruvec) { - if (page_matches_lruvec(page, locked_lruvec)) + if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irqrestore(locked_lruvec, *flags); } - return lock_page_lruvec_irqsave(page, flags); + return folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -1558,17 +1612,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); -void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); -static inline void mem_cgroup_track_foreign_dirty(struct page *page, +static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { if (mem_cgroup_disabled()) return; - if (unlikely(&page_memcg(page)->css != wb->memcg_css)) - mem_cgroup_track_foreign_dirty_slowpath(page, wb); + if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); @@ -1588,7 +1642,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, { } -static inline void mem_cgroup_track_foreign_dirty(struct page *page, +static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } diff --git a/include/linux/memory.h b/include/linux/memory.h index 7efc0a7c14c9..182c606adb06 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -160,7 +160,10 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, #define register_hotmemory_notifier(nb) register_memory_notifier(nb) #define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #else -#define hotplug_memory_notifier(fn, pri) ({ 0; }) +static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) +{ + return 0; +} /* These aren't inline functions due to a GCC bug. 
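The relock helpers above exist so that a loop over many folios only drops and re-acquires the LRU lock when a folio actually belongs to a different lruvec. A sketch of the intended calling pattern (demo_walk_folios() and the work inside the loop are illustrative):

static void demo_walk_folios(struct list_head *folios)
{
        struct lruvec *lruvec = NULL;
        struct folio *folio;

        list_for_each_entry(folio, folios, lru) {
                /* reuses the held lock if the folio maps to the same lruvec */
                lruvec = folio_lruvec_relock_irq(folio, lruvec);

                /* ... operate on the folio under lruvec->lru_lock ... */
        }
        if (lruvec)
                unlock_page_lruvec_irq(lruvec);
}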
*/ #define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) #define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index c8077e936691..0d2aeb9b0f66 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -57,6 +57,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +void folio_migrate_flags(struct folio *newfolio, struct folio *folio); +void folio_migrate_copy(struct folio *newfolio, struct folio *folio); +int folio_migrate_mapping(struct address_space *mapping, + struct folio *newfolio, struct folio *folio, int extra_count); #else static inline void putback_movable_pages(struct list_head *l) {} diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e23417424373..f17d2101af7a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1138,7 +1138,6 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); -bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3638d09ba77..993204a6c1a1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9475,16 +9475,22 @@ struct mlx5_ifc_pcmr_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; u8 reserved_at_10[0x10]; + u8 entropy_force_cap[0x1]; u8 entropy_calc_cap[0x1]; u8 entropy_gre_calc_cap[0x1]; - u8 reserved_at_23[0x1b]; + u8 reserved_at_23[0xf]; + u8 rx_ts_over_crc_cap[0x1]; + u8 reserved_at_33[0xb]; u8 fcs_cap[0x1]; u8 reserved_at_3f[0x1]; + u8 entropy_force[0x1]; u8 entropy_calc[0x1]; u8 entropy_gre_calc[0x1]; - u8 reserved_at_43[0x1b]; + u8 reserved_at_43[0xf]; + u8 rx_ts_over_crc[0x1]; + u8 reserved_at_53[0xb]; u8 fcs_chk[0x1]; u8 reserved_at_5f[0x1]; }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..40ff114aaf9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,10 +36,7 @@ struct mempolicy; struct anon_vma; struct anon_vma_chain; -struct file_ra_state; struct user_struct; -struct writeback_control; -struct bdi_writeback; struct pt_regs; extern int sysctl_page_lock_unfairness; @@ -216,13 +213,6 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -/* - * Any attempt to mark this function as static leads to build failure - * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() - * is referred to by BPF code. This must be visible for error injection. 
- */ -int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void **shadowp); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -748,13 +738,18 @@ static inline int put_page_testzero(struct page *page) return page_ref_dec_and_test(page); } +static inline int folio_put_testzero(struct folio *folio) +{ + return put_page_testzero(&folio->page); +} + /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ -static inline int get_page_unless_zero(struct page *page) +static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } @@ -907,7 +902,7 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); -void copy_huge_page(struct page *dst, struct page *src); +void folio_copy(struct folio *dst, struct folio *src); /* * Compound pages have a destructor function. Provide a @@ -950,6 +945,20 @@ static inline unsigned int compound_order(struct page *page) return page[1].compound_order; } +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + static inline bool hpage_pincount_available(struct page *page) { /* @@ -1131,6 +1140,11 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +static inline enum zone_type folio_zonenum(const struct folio *folio) +{ + return page_zonenum(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { @@ -1200,18 +1214,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } /* 127: arbitrary random number, small enough to assemble well */ -#define page_ref_zero_or_close_to_overflow(page) \ - ((unsigned int) page_ref_count(page) + 127u <= 127u) +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) folio_ref_count(folio) + 127u <= 127u) + +/** + * folio_get - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * folio_try_get() may be the right interface for you to use. + */ +static inline void folio_get(struct folio *folio) +{ + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + folio_ref_inc(folio); +} static inline void get_page(struct page *page) { - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_refcount. - */ - VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); - page_ref_inc(page); + folio_get(page_folio(page)); } bool __must_check try_grab_page(struct page *page, unsigned int flags); @@ -1228,9 +1250,28 @@ static inline __must_check bool try_get_page(struct page *page) return true; } +/** + * folio_put - Decrement the reference count on a folio. + * @folio: The folio. 
+ * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put(struct folio *folio) +{ + if (folio_put_testzero(folio)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { - page = compound_head(page); + struct folio *folio = page_folio(page); /* * For devmap managed pages we need to catch refcount transition from @@ -1238,13 +1279,12 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (page_is_devmap_managed(&folio->page)) { + put_devmap_managed_page(&folio->page); return; } - if (put_page_testzero(page)) - __put_page(page); + folio_put(folio); } /* @@ -1379,6 +1419,11 @@ static inline int page_to_nid(const struct page *page) } #endif +static inline int folio_nid(const struct folio *folio) +{ + return page_to_nid(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING static inline int cpu_pid_to_cpupid(int cpu, int pid) { @@ -1546,6 +1591,16 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } +static inline struct zone *folio_zone(const struct folio *folio) +{ + return page_zone(&folio->page); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return page_pgdat(&folio->page); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { @@ -1559,6 +1614,20 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/** + * folio_pfn - Return the Page Frame Number of a folio. + * @folio: The folio. + * + * A folio may contain multiple pages. The pages have consecutive + * Page Frame Numbers. + * + * Return: The Page Frame Number of the first page in the folio. + */ +static inline unsigned long folio_pfn(struct folio *folio) +{ + return page_to_pfn(&folio->page); +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) @@ -1595,6 +1664,89 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A positive power of two. + */ +static inline long folio_nr_pages(struct folio *folio) +{ + return compound_nr(&folio->page); +} + +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. + * + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. This is the folio equivalent to writing ``page++``. + * + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. 
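A small sketch of the reference-counting contract spelled out above: folio_get() may only be called by someone who already holds a reference, and the folio must not be touched after the final folio_put(); demo_use_folio() is illustrative only:

static void demo_use_folio(struct folio *folio)
{
        folio_get(folio);       /* legal: the caller already holds a reference */

        /* ... the folio's memory is safe to access here ... */

        folio_put(folio);       /* may free the folio if this was the last reference */
}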
+ */ +static inline struct folio *folio_next(struct folio *folio) +{ + return (struct folio *)folio_page(folio, folio_nr_pages(folio)); +} + +/** + * folio_shift - The size of the memory described by this folio. + * @folio: The folio. + * + * A folio represents a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. See also + * folio_size() and folio_order(). + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(struct folio *folio) +{ + return PAGE_SHIFT + folio_order(folio); +} + +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. + */ +static inline size_t folio_size(struct folio *folio) +{ + return PAGE_SIZE << folio_order(folio); +} + +#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +static inline int arch_make_page_accessible(struct page *page) +{ + return 0; +} +#endif + +#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE +static inline int arch_make_folio_accessible(struct folio *folio) +{ + int ret; + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + ret = arch_make_page_accessible(folio_page(folio, i)); + if (ret) + break; + } + + return ret; +} +#endif + /* * Some inline functions in vmstat.h depend on page_zone() */ @@ -1635,19 +1787,6 @@ void page_address_init(void); extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); -extern struct address_space *page_mapping(struct page *page); - -extern struct address_space *__page_file_mapping(struct page *); - -static inline -struct address_space *page_file_mapping(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_mapping(page); - - return page->mapping; -} - extern pgoff_t __page_file_index(struct page *page); /* @@ -1662,7 +1801,7 @@ static inline pgoff_t page_index(struct page *page) } bool page_mapped(struct page *page); -struct address_space *page_mapping(struct page *page); +bool folio_mapped(struct folio *folio); /* * Return true only if the page has been allocated with @@ -1700,6 +1839,7 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in @@ -1854,20 +1994,9 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); -void account_page_cleaned(struct page *page, struct address_space *mapping, - struct bdi_writeback *wb); -int set_page_dirty(struct page *page); +bool folio_mark_dirty(struct folio *folio); +bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void __cancel_dirty_page(struct page *page); -static inline void cancel_dirty_page(struct page *page) -{ - /* Avoid atomic ops, locking, etc. when not actually needed. 
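Putting several of the helpers above together, physically contiguous memory that spans more than one folio (e.g. a bio_vec) can be walked with folio_next(); demo_range_bytes() is illustrative and assumes the range is exactly covered by whole folios:

static size_t demo_range_bytes(struct folio *first, long nr_pages)
{
        struct folio *folio = first;
        size_t bytes = 0;

        while (nr_pages > 0) {
                bytes += folio_size(folio);
                nr_pages -= folio_nr_pages(folio);
                folio = folio_next(folio);      /* the physically adjacent folio */
        }
        return bytes;
}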
*/ - if (PageDirty(page)) - __cancel_dirty_page(page); -} -int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); @@ -2659,10 +2788,6 @@ extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -/* mm/page-writeback.c */ -int __must_check write_one_page(struct page *page); -void task_dirty_inc(struct task_struct *tsk); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355ea1ee32bd..e2ec68b0515c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,27 +6,33 @@ #include <linux/swap.h> /** - * page_is_file_lru - should the page be on a file LRU or anon LRU? - * @page: the page to test - * - * Returns 1 if @page is a regular filesystem backed page cache page or a lazily - * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal - * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by - * functions that manipulate the LRU lists, to sort a page onto the right LRU - * list. + * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? + * @folio: The folio to test. * * We would like to get this info without a page flag, but the state - * needs to survive until the page is last deleted from the LRU, which + * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. + * + * Return: An integer (not a boolean!) used to sort a folio onto the + * right LRU list and to account folios correctly. + * 1 if @folio is a regular filesystem backed page cache folio + * or a lazily freed anonymous folio (e.g. via MADV_FREE). + * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise + * ram or swap backed folio. */ +static inline int folio_is_file_lru(struct folio *folio) +{ + return !folio_test_swapbacked(folio); +} + static inline int page_is_file_lru(struct page *page) { - return !PageSwapBacked(page); + return folio_is_file_lru(page_folio(page)); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, - int nr_pages) + long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -39,69 +45,94 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, } /** - * __clear_page_lru_flags - clear page lru flags before releasing a page - * @page: the page that was on lru and now has a zero reference + * __folio_clear_lru_flags - Clear page lru flags before releasing a page. + * @folio: The folio that was on lru and now has a zero reference. 
*/ -static __always_inline void __clear_page_lru_flags(struct page *page) +static __always_inline void __folio_clear_lru_flags(struct folio *folio) { - VM_BUG_ON_PAGE(!PageLRU(page), page); + VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); - __ClearPageLRU(page); + __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ - if (PageActive(page) && PageUnevictable(page)) + if (folio_test_active(folio) && folio_test_unevictable(folio)) return; - __ClearPageActive(page); - __ClearPageUnevictable(page); + __folio_clear_active(folio); + __folio_clear_unevictable(folio); +} + +static __always_inline void __clear_page_lru_flags(struct page *page) +{ + __folio_clear_lru_flags(page_folio(page)); } /** - * page_lru - which LRU list should a page be on? - * @page: the page to test + * folio_lru_list - Which LRU list should a folio be on? + * @folio: The folio to test. * - * Returns the LRU list a page should be on, as an index + * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list page_lru(struct page *page) +static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; - lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; - if (PageActive(page)) + lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; + if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } +static __always_inline +void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + list_add(&folio->lru, &lruvec->lists[lru]); +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec) { - enum lru_list lru = page_lru(page); + lruvec_add_folio(lruvec, page_folio(page)); +} - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); +static __always_inline +void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list_tail(struct page *page, struct lruvec *lruvec) { - enum lru_list lru = page_lru(page); + lruvec_add_folio_tail(lruvec, page_folio(page)); +} - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); +static __always_inline +void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) +{ + list_del(&folio->lru); + update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + -folio_nr_pages(folio)); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec) { - list_del(&page->lru); - update_lru_size(lruvec, page_lru(page), page_zonenum(page), - -thp_nr_pages(page)); + lruvec_del_folio(lruvec, page_folio(page)); } #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e9672de22cf2..f92686cf2cdb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,6 +240,72 @@ struct page { #endif } 
_struct_page_alignment; +/** + * struct folio - Represents a contiguous set of bytes. + * @flags: Identical to the page flags. + * @lru: Least Recently Used list; tracks how recently this folio was used. + * @mapping: The file this page belongs to, or refers to the anon_vma for + * anonymous memory. + * @index: Offset within the file, in units of pages. For anonymous memory, + * this is the index from the beginning of the mmap. + * @private: Filesystem per-folio data (see folio_attach_private()). + * Used for swp_entry_t if folio_test_swapcache(). + * @_mapcount: Do not access this member directly. Use folio_mapcount() to + * find out how many times this folio is mapped by userspace. + * @_refcount: Do not access this member directly. Use folio_ref_count() + * to find how many references there are to this folio. + * @memcg_data: Memory Control Group data. + * + * A folio is a physically, virtually and logically contiguous set + * of bytes. It is a power-of-two in size, and it is aligned to that + * same power-of-two. It is at least as large as %PAGE_SIZE. If it is + * in the page cache, it is at a file offset which is a multiple of that + * power-of-two. It may be mapped into userspace at an address which is + * at an arbitrary page offset, but its kernel virtual address is aligned + * to its size. + */ +struct folio { + /* private: don't document the anon union */ + union { + struct { + /* public: */ + unsigned long flags; + struct list_head lru; + struct address_space *mapping; + pgoff_t index; + void *private; + atomic_t _mapcount; + atomic_t _refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif + /* private: the union with struct page is transitional */ + }; + struct page page; + }; +}; + +static_assert(sizeof(struct page) == sizeof(struct folio)); +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) +FOLIO_MATCH(flags, flags); +FOLIO_MATCH(lru, lru); +FOLIO_MATCH(compound_head, lru); +FOLIO_MATCH(index, index); +FOLIO_MATCH(private, private); +FOLIO_MATCH(_mapcount, _mapcount); +FOLIO_MATCH(_refcount, _refcount); +#ifdef CONFIG_MEMCG +FOLIO_MATCH(memcg_data, memcg_data); +#endif +#undef FOLIO_MATCH + +static inline atomic_t *folio_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->compound_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; @@ -258,6 +324,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +/* + * page_private can be used on tail pages. However, PagePrivate is only + * checked by the VM on the head page. 
So page_private on the tail pages + * should be used for data that's ancillary to the head page (eg attaching + * buffer heads to tail pages after attaching buffer heads to the head page) + */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) @@ -265,6 +337,11 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } +static inline void *folio_get_private(struct folio *folio) +{ + return folio->private; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0c0c9a0fdf57..52eae8c45b8d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -15,7 +15,7 @@ #include <linux/mmc/card.h> #include <linux/mmc/pm.h> #include <linux/dma-direction.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> struct mmc_ios { unsigned int clock; /* clock rate */ @@ -492,7 +492,7 @@ struct mmc_host { /* Inline encryption support */ #ifdef CONFIG_MMC_CRYPTO - struct blk_keyslot_manager ksm; + struct blk_crypto_profile crypto_profile; #endif /* Host Software Queue support */ diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 1935d4c72d10..d7285f8148a3 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -22,6 +22,13 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_BUG_ON_FOLIO(cond, folio) \ + do { \ + if (unlikely(cond)) { \ + dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\ + BUG(); \ + } \ + } while (0) #define VM_BUG_ON_VMA(cond, vma) \ do { \ if (unlikely(cond)) { \ @@ -47,6 +54,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -55,11 +73,13 @@ void dump_mm(const struct mm_struct *mm); #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond) #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond) #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d6a4158a9a6..12c4177f7703 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -22,6 +22,7 @@ * Overload PG_private_2 to give us PG_fscache - this is used to indicate that * a page is currently backed by a local disk cache */ +#define folio_test_fscache(folio) folio_test_private_2(folio) #define PageFsCache(page) PagePrivate2((page)) #define SetPageFsCache(page) SetPagePrivate2((page)) #define ClearPageFsCache(page) ClearPagePrivate2((page)) @@ -29,60 +30,80 @@ #define TestClearPageFsCache(page) TestClearPagePrivate2((page)) /** - * set_page_fscache - Set PG_fscache on a page and take a ref - * @page: The page. + * folio_start_fscache - Start an fscache write on a folio. + * @folio: The folio. * - * Set the PG_fscache (PG_private_2) flag on a page and take the reference - * needed for the VM to handle its lifetime correctly. This sets the flag and - * takes the reference unconditionally, so care must be taken not to set the - * flag again if it's already set. + * Call this function before writing a folio to a local cache. Starting a + * second write before the first one finishes is not allowed. */ -static inline void set_page_fscache(struct page *page) +static inline void folio_start_fscache(struct folio *folio) { - set_page_private_2(page); + VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio); + folio_get(folio); + folio_set_private_2(folio); } /** - * end_page_fscache - Clear PG_fscache and release any waiters - * @page: The page - * - * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers - * waiting for this. The page ref held for PG_private_2 being set is released. + * folio_end_fscache - End an fscache write on a folio. + * @folio: The folio. * - * This is, for example, used when a netfs page is being written to a local - * disk cache, thereby allowing writes to the cache for the same page to be - * serialised. + * Call this function after the folio has been written to the local cache. + * This will wake any sleepers waiting on this folio. */ -static inline void end_page_fscache(struct page *page) +static inline void folio_end_fscache(struct folio *folio) { - end_page_private_2(page); + folio_end_private_2(folio); } /** - * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page - * @page: The page to wait on + * folio_wait_fscache - Wait for an fscache write on this folio to end. + * @folio: The folio. * - * Wait for PG_fscache (aka PG_private_2) to be cleared on a page. + * If this folio is currently being written to a local cache, wait for + * the write to finish. Another write may start after this one finishes, + * unless the caller holds the folio lock. */ -static inline void wait_on_page_fscache(struct page *page) +static inline void folio_wait_fscache(struct folio *folio) { - wait_on_page_private_2(page); + folio_wait_private_2(folio); } /** - * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page - * @page: The page to wait on + * folio_wait_fscache_killable - Wait for an fscache write on this folio to end. + * @folio: The folio. * - * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a - * fatal signal is received by the calling task. + * If this folio is currently being written to a local cache, wait + * for the write to finish or for a fatal signal to be received. + * Another write may start after this one finishes, unless the caller + * holds the folio lock. 
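Read together, the fscache helpers above describe one write-to-cache cycle per folio: mark the folio, submit the write, then complete it, which wakes any waiter and drops the reference. A sketch of that sequence (demo_cache_one_folio() is illustrative; the submission step is driver-specific and only indicated by a comment):

static void demo_cache_one_folio(struct folio *folio)
{
        /* takes a folio reference and sets PG_private_2 (PG_fscache) */
        folio_start_fscache(folio);

        /* ... submit the asynchronous write to the local cache here ... */

        /*
         * On completion the cache code calls folio_end_fscache(folio), which
         * clears PG_private_2, wakes folio_wait_fscache() callers and drops
         * the reference taken above.
         */
}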
* * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ +static inline int folio_wait_fscache_killable(struct folio *folio) +{ + return folio_wait_private_2_killable(folio); +} + +static inline void set_page_fscache(struct page *page) +{ + folio_start_fscache(page_folio(page)); +} + +static inline void end_page_fscache(struct page *page) +{ + folio_end_private_2(page_folio(page)); +} + +static inline void wait_on_page_fscache(struct page *page) +{ + folio_wait_private_2(page_folio(page)); +} + static inline int wait_on_page_fscache_killable(struct page *page) { - return wait_on_page_private_2_killable(page); + return folio_wait_private_2_killable(page_folio(page)); } enum netfs_read_source { diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2a38f2b477a5..cb909edb76c4 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -7,6 +7,7 @@ #define _NVME_FC_DRIVER_H 1 #include <linux/scatterlist.h> +#include <linux/blk-mq.h> /* @@ -497,6 +498,8 @@ struct nvme_fc_port_template { int (*xmt_ls_rsp)(struct nvme_fc_local_port *localport, struct nvme_fc_remote_port *rport, struct nvmefc_ls_rsp *ls_rsp); + void (*map_queues)(struct nvme_fc_local_port *localport, + struct blk_mq_queue_map *map); u32 max_hw_queues; u16 max_sgl_segments; @@ -779,6 +782,10 @@ struct nvmet_fc_target_port { * LS received. * Entrypoint is Mandatory. * + * @map_queues: This functions lets the driver expose the queue mapping + * to the block layer. + * Entrypoint is Optional. + * * @fcp_op: Called to perform a data transfer or transmit a response. * The nvmefc_tgt_fcp_req structure is the same LLDD-supplied * exchange structure specified in the nvmet_fc_rcv_fcp_req() call diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 3ec8e50efa16..4dd7e6fe92fb 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -6,6 +6,8 @@ #ifndef _LINUX_NVME_RDMA_H #define _LINUX_NVME_RDMA_H +#define NVME_RDMA_MAX_QUEUE_SIZE 128 + enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, }; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b7c4c4130b65..855dd9b3e84b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -27,8 +27,20 @@ #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { - NVME_NQN_DISC = 1, /* Discovery type target subsystem */ - NVME_NQN_NVME = 2, /* NVME type target subsystem */ + /* Referral to another discovery type target subsystem */ + NVME_NQN_DISC = 1, + + /* NVME type target subsystem */ + NVME_NQN_NVME = 2, + + /* Current discovery type target subsystem */ + NVME_NQN_CURR = 3, +}; + +enum nvme_ctrl_type { + NVME_CTRL_IO = 1, /* I/O controller */ + NVME_CTRL_DISC = 2, /* Discovery controller */ + NVME_CTRL_ADMIN = 3, /* Administrative controller */ }; /* Address Family codes for Discovery Log Page entry ADRFAM field */ @@ -244,7 +256,9 @@ struct nvme_id_ctrl { __le32 rtd3e; __le32 oaes; __le32 ctratt; - __u8 rsvd100[28]; + __u8 rsvd100[11]; + __u8 cntrltype; + __u8 fguid[16]; __le16 crdt1; __le16 crdt2; __le16 crdt3; @@ -312,6 +326,7 @@ struct nvme_id_ctrl { }; enum { + NVME_CTRL_CMIC_MULTI_PORT = 1 << 0, NVME_CTRL_CMIC_MULTI_CTRL = 1 << 1, NVME_CTRL_CMIC_ANA = 1 << 3, NVME_CTRL_ONCS_COMPARE = 1 << 0, @@ -1303,6 +1318,12 @@ struct nvmf_common_command { #define MAX_DISC_LOGS 255 +/* Discovery log page entry flags (EFLAGS): */ +enum { + NVME_DISC_EFLAGS_EPCSD = (1 << 1), + NVME_DISC_EFLAGS_DUPRETINFO = (1 << 0), +}; + /* Discovery log page entry */ struct nvmf_disc_rsp_page_entry { 
__u8 trtype; @@ -1312,7 +1333,8 @@ struct nvmf_disc_rsp_page_entry { __le16 portid; __le16 cntlid; __le16 asqsz; - __u8 resv8[22]; + __le16 eflags; + __u8 resv10[20]; char trsvcid[NVMF_TRSVCID_SIZE]; __u8 resv64[192]; char subnqn[NVMF_NQN_FIELD_LEN]; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a558d67ee86f..d8623d6e1141 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -143,6 +143,8 @@ enum pageflags { #endif __NR_PAGEFLAGS, + PG_readahead = PG_reclaim, + /* Filesystems */ PG_checked = PG_owner_priv_1, @@ -171,6 +173,15 @@ enum pageflags { /* Compound pages. Stored in first tail page's flags */ PG_double_map = PG_workingset, +#ifdef CONFIG_MEMORY_FAILURE + /* + * Compound pages. Stored in first tail page's flags. + * Indicates that at least one subpage is hwpoisoned in the + * THP. + */ + PG_has_hwpoisoned = PG_mappedtodisk, +#endif + /* non-lru isolated movable page */ PG_isolated = PG_reclaim, @@ -193,6 +204,34 @@ static inline unsigned long _compound_head(const struct page *page) #define compound_head(page) ((typeof(page))_compound_head(page)) +/** + * page_folio - Converts from page to folio. + * @p: The page. + * + * Every page is part of a folio. This function cannot be called on a + * NULL pointer. + * + * Context: No reference, nor lock is required on @page. If the caller + * does not hold a reference, this call may race with a folio split, so + * it should re-check the folio still contains this page after gaining + * a reference on the folio. + * Return: The folio which contains this page. + */ +#define page_folio(p) (_Generic((p), \ + const struct page *: (const struct folio *)_compound_head(p), \ + struct page *: (struct folio *)_compound_head(p))) + +/** + * folio_page - Return a page from a folio. + * @folio: The folio. + * @n: The page number to return. + * + * @n is relative to the start of the folio. This function does not + * check that the page number lies within @folio; the caller is presumed + * to have a reference to the page. 
+ */ +#define folio_page(folio, n) nth_page(&(folio)->page, n) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; @@ -217,6 +256,15 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static unsigned long *folio_flags(struct folio *folio, unsigned n) +{ + struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + /* * Page flags policies wrt compound pages * @@ -261,36 +309,64 @@ static inline void page_init_poison(struct page *page, size_t size) VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) +/* Which page is the flag stored in */ +#define FOLIO_PF_ANY 0 +#define FOLIO_PF_HEAD 0 +#define FOLIO_PF_ONLY_HEAD 0 +#define FOLIO_PF_NO_TAIL 0 +#define FOLIO_PF_NO_COMPOUND 0 +#define FOLIO_PF_SECOND 1 + /* * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ +static __always_inline bool folio_test_##lname(struct folio *folio) \ +{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int Page##uname(struct page *page) \ - { return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void folio_set_##lname(struct folio *folio) \ +{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void folio_clear_##lname(struct folio *folio) \ +{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void __folio_set_##lname(struct folio *folio) \ +{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void __folio_clear_##lname(struct folio *folio) \ +{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ +static __always_inline \ +bool folio_test_set_##lname(struct folio *folio) \ +{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ +static __always_inline \ +bool folio_test_clear_##lname(struct folio *folio) \ +{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestClearPage##uname(struct page *page) \ - { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } 
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -306,29 +382,37 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) -#define TESTPAGEFLAG_FALSE(uname) \ +#define TESTPAGEFLAG_FALSE(uname, lname) \ +static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ static inline int Page##uname(const struct page *page) { return 0; } -#define SETPAGEFLAG_NOOP(uname) \ +#define SETPAGEFLAG_NOOP(uname, lname) \ +static inline void folio_set_##lname(struct folio *folio) { } \ static inline void SetPage##uname(struct page *page) { } -#define CLEARPAGEFLAG_NOOP(uname) \ +#define CLEARPAGEFLAG_NOOP(uname, lname) \ +static inline void folio_clear_##lname(struct folio *folio) { } \ static inline void ClearPage##uname(struct page *page) { } -#define __CLEARPAGEFLAG_NOOP(uname) \ +#define __CLEARPAGEFLAG_NOOP(uname, lname) \ +static inline void __folio_clear_##lname(struct folio *folio) { } \ static inline void __ClearPage##uname(struct page *page) { } -#define TESTSETFLAG_FALSE(uname) \ +#define TESTSETFLAG_FALSE(uname, lname) \ +static inline bool folio_test_set_##lname(struct folio *folio) \ +{ return 0; } \ static inline int TestSetPage##uname(struct page *page) { return 0; } -#define TESTCLEARFLAG_FALSE(uname) \ +#define TESTCLEARFLAG_FALSE(uname, lname) \ +static inline bool folio_test_clear_##lname(struct folio *folio) \ +{ return 0; } \ static inline int TestClearPage##uname(struct page *page) { return 0; } -#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ - SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) +#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ + SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) -#define TESTSCFLAG_FALSE(uname) \ - TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) +#define TESTSCFLAG_FALSE(uname, lname) \ + TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) @@ -384,8 +468,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) -PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) - TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -394,22 +478,25 @@ PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #else -PAGEFLAG_FALSE(HighMem) +PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP -static __always_inline int PageSwapCache(struct page *page) +static __always_inline bool folio_test_swapcache(struct folio *folio) { -#ifdef CONFIG_THP_SWAP - page = compound_head(page); -#endif - return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags); + return folio_test_swapbacked(folio) && + test_bit(PG_swapcache, folio_flags(folio, 0)); +} +static __always_inline bool PageSwapCache(struct page *page) +{ + return folio_test_swapcache(page_folio(page)); } + SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else -PAGEFLAG_FALSE(SwapCache) +PAGEFLAG_FALSE(SwapCache, swapcache) #endif PAGEFLAG(Unevictable, 
unevictable, PF_HEAD) @@ -421,14 +508,14 @@ PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) #else -PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) - TESTSCFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) + TESTSCFLAG_FALSE(Mlocked, mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else -PAGEFLAG_FALSE(Uncached) +PAGEFLAG_FALSE(Uncached, uncached) #endif #ifdef CONFIG_MEMORY_FAILURE @@ -437,7 +524,7 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool take_page_off_buddy(struct page *page); #else -PAGEFLAG_FALSE(HWPoison) +PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif @@ -451,7 +538,7 @@ PAGEFLAG(Idle, idle, PF_ANY) #ifdef CONFIG_KASAN_HW_TAGS PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD) #else -PAGEFLAG_FALSE(SkipKASanPoison) +PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) #endif /* @@ -489,10 +576,14 @@ static __always_inline int PageMappingFlags(struct page *page) return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline int PageAnon(struct page *page) +static __always_inline bool folio_test_anon(struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; +} + +static __always_inline bool PageAnon(struct page *page) { - page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return folio_test_anon(page_folio(page)); } static __always_inline int __PageMovable(struct page *page) @@ -508,30 +599,32 @@ static __always_inline int __PageMovable(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static __always_inline int PageKsm(struct page *page) +static __always_inline bool folio_test_ksm(struct folio *folio) { - page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } + +static __always_inline bool PageKsm(struct page *page) +{ + return folio_test_ksm(page_folio(page)); +} #else -TESTPAGEFLAG_FALSE(Ksm) +TESTPAGEFLAG_FALSE(Ksm, ksm) #endif u64 stable_page_flags(struct page *page); -static inline int PageUptodate(struct page *page) +static inline bool folio_test_uptodate(struct folio *folio) { - int ret; - page = compound_head(page); - ret = test_bit(PG_uptodate, &(page)->flags); + bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); /* - * Must ensure that the data we read out of the page is loaded - * _after_ we've loaded page->flags to check for PageUptodate. - * We can skip the barrier if the page is not uptodate, because + * Must ensure that the data we read out of the folio is loaded + * _after_ we've loaded folio->flags to check the uptodate bit. + * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * - * See SetPageUptodate() for the other side of the story. + * See folio_mark_uptodate() for the other side of the story. 
*/ if (ret) smp_rmb(); @@ -539,47 +632,71 @@ static inline int PageUptodate(struct page *page) return ret; } -static __always_inline void __SetPageUptodate(struct page *page) +static inline int PageUptodate(struct page *page) +{ + return folio_test_uptodate(page_folio(page)); +} + +static __always_inline void __folio_mark_uptodate(struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); - __set_bit(PG_uptodate, &page->flags); + __set_bit(PG_uptodate, folio_flags(folio, 0)); } -static __always_inline void SetPageUptodate(struct page *page) +static __always_inline void folio_mark_uptodate(struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, - * so that all previous stores issued in order to bring the page - * uptodate are actually visible before PageUptodate becomes true. + * so that all previous stores issued in order to bring the folio + * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); - set_bit(PG_uptodate, &page->flags); + set_bit(PG_uptodate, folio_flags(folio, 0)); +} + +static __always_inline void __SetPageUptodate(struct page *page) +{ + __folio_mark_uptodate((struct folio *)page); +} + +static __always_inline void SetPageUptodate(struct page *page) +{ + folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -int test_clear_page_writeback(struct page *page); -int __test_set_page_writeback(struct page *page, bool keep_write); +bool __folio_start_writeback(struct folio *folio, bool keep_write); +bool set_page_writeback(struct page *page); -#define test_set_page_writeback(page) \ - __test_set_page_writeback(page, false) -#define test_set_page_writeback_keepwrite(page) \ - __test_set_page_writeback(page, true) +#define folio_start_writeback(folio) \ + __folio_start_writeback(folio, false) +#define folio_start_writeback_keepwrite(folio) \ + __folio_start_writeback(folio, true) -static inline void set_page_writeback(struct page *page) +static inline void set_page_writeback_keepwrite(struct page *page) { - test_set_page_writeback(page); + folio_start_writeback_keepwrite(page_folio(page)); } -static inline void set_page_writeback_keepwrite(struct page *page) +static inline bool test_set_page_writeback(struct page *page) { - test_set_page_writeback_keepwrite(page); + return set_page_writeback(page); } __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +/* Whether there are one or multiple pages in a folio */ +static inline bool folio_test_single(struct folio *folio) +{ + return !folio_test_head(folio); +} + +static inline bool folio_test_multi(struct folio *folio) +{ + return folio_test_head(folio); +} + static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); @@ -603,12 +720,15 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); +static inline bool folio_test_hugetlb(struct folio *folio) +{ + return PageHeadHuge(&folio->page); +} #else -TESTPAGEFLAG_FALSE(Huge) -TESTPAGEFLAG_FALSE(HeadHuge) +TESTPAGEFLAG_FALSE(Huge, hugetlb) +TESTPAGEFLAG_FALSE(HeadHuge, headhuge) #endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -624,6 +744,11 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } +static inline bool folio_test_transhuge(struct folio 
*folio) +{ + return folio_test_head(folio); +} + /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * @@ -660,12 +785,26 @@ static inline int PageTransTail(struct page *page) PAGEFLAG(DoubleMap, double_map, PF_SECOND) TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else -TESTPAGEFLAG_FALSE(TransHuge) -TESTPAGEFLAG_FALSE(TransCompound) -TESTPAGEFLAG_FALSE(TransCompoundMap) -TESTPAGEFLAG_FALSE(TransTail) -PAGEFLAG_FALSE(DoubleMap) - TESTSCFLAG_FALSE(DoubleMap) +TESTPAGEFLAG_FALSE(TransHuge, transhuge) +TESTPAGEFLAG_FALSE(TransCompound, transcompound) +TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) +TESTPAGEFLAG_FALSE(TransTail, transtail) +PAGEFLAG_FALSE(DoubleMap, double_map) + TESTSCFLAG_FALSE(DoubleMap, double_map) +#endif + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +/* + * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the + * compound page. + * + * This flag is set by hwpoison handler. Cleared by THP split or free page. + */ +PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) + TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +#else +PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) + TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* @@ -849,6 +988,11 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_has_private(struct folio *folio) +{ + return page_has_private(&folio->page); +} + #undef PF_ANY #undef PF_HEAD #undef PF_ONLY_HEAD diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index d8a6aecf99cb..83abf95e9fa7 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -8,46 +8,16 @@ #ifdef CONFIG_PAGE_IDLE_FLAG -#ifdef CONFIG_64BIT -static inline bool page_is_young(struct page *page) -{ - return PageYoung(page); -} - -static inline void set_page_young(struct page *page) -{ - SetPageYoung(page); -} - -static inline bool test_and_clear_page_young(struct page *page) -{ - return TestClearPageYoung(page); -} - -static inline bool page_is_idle(struct page *page) -{ - return PageIdle(page); -} - -static inline void set_page_idle(struct page *page) -{ - SetPageIdle(page); -} - -static inline void clear_page_idle(struct page *page) -{ - ClearPageIdle(page); -} -#else /* !CONFIG_64BIT */ +#ifndef CONFIG_64BIT /* * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead.
*/ extern struct page_ext_operations page_idle_ops; -static inline bool page_is_young(struct page *page) +static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -55,9 +25,9 @@ static inline bool page_is_young(struct page *page) return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline void set_page_young(struct page *page) +static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; @@ -65,9 +35,9 @@ static inline void set_page_young(struct page *page) set_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline bool test_and_clear_page_young(struct page *page) +static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -75,9 +45,9 @@ static inline bool test_and_clear_page_young(struct page *page) return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline bool page_is_idle(struct page *page) +static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -85,9 +55,9 @@ static inline bool page_is_idle(struct page *page) return test_bit(PAGE_EXT_IDLE, &page_ext->flags); } -static inline void set_page_idle(struct page *page) +static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; @@ -95,46 +65,75 @@ static inline void set_page_idle(struct page *page) set_bit(PAGE_EXT_IDLE, &page_ext->flags); } -static inline void clear_page_idle(struct page *page) +static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); } -#endif /* CONFIG_64BIT */ +#endif /* !CONFIG_64BIT */ #else /* !CONFIG_PAGE_IDLE_FLAG */ -static inline bool page_is_young(struct page *page) +static inline bool folio_test_young(struct folio *folio) { return false; } -static inline void set_page_young(struct page *page) +static inline void folio_set_young(struct folio *folio) { } -static inline bool test_and_clear_page_young(struct page *page) +static inline bool folio_test_clear_young(struct folio *folio) { return false; } -static inline bool page_is_idle(struct page *page) +static inline bool folio_test_idle(struct folio *folio) { return false; } -static inline void set_page_idle(struct page *page) +static inline void folio_set_idle(struct folio *folio) { } -static inline void clear_page_idle(struct page *page) +static inline void folio_clear_idle(struct folio *folio) { } #endif /* CONFIG_PAGE_IDLE_FLAG */ +static inline bool page_is_young(struct page *page) +{ + return folio_test_young(page_folio(page)); +} + +static inline void set_page_young(struct page *page) +{ + folio_set_young(page_folio(page)); +} + +static inline bool test_and_clear_page_young(struct page *page) +{ + return folio_test_clear_young(page_folio(page)); +} + +static inline bool page_is_idle(struct 
page *page) +{ + return folio_test_idle(page_folio(page)); +} + +static inline void set_page_idle(struct page *page) +{ + folio_set_idle(page_folio(page)); +} + +static inline void clear_page_idle(struct page *page) +{ + folio_clear_idle(page_folio(page)); +} #endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 719bfe5108c5..43c638c51c1f 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -12,7 +12,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); extern void __split_page_owner(struct page *page, unsigned int nr); -extern void __copy_page_owner(struct page *oldpage, struct page *newpage); +extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -36,10 +36,10 @@ static inline void split_page_owner(struct page *page, unsigned int nr) if (static_branch_unlikely(&page_owner_inited)) __split_page_owner(page, nr); } -static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { if (static_branch_unlikely(&page_owner_inited)) - __copy_page_owner(oldpage, newpage); + __folio_copy_owner(newfolio, old); } static inline void set_page_owner_migrate_reason(struct page *page, int reason) { @@ -63,7 +63,7 @@ static inline void split_page_owner(struct page *page, unsigned int order) { } -static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) { } static inline void set_page_owner_migrate_reason(struct page *page, int reason) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 7ad46f45df39..2e677e6ad09f 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -67,9 +67,31 @@ static inline int page_ref_count(const struct page *page) return atomic_read(&page->_refcount); } +/** + * folio_ref_count - The reference count on this folio. + * @folio: The folio. + * + * The refcount is usually incremented by calls to folio_get() and + * decremented by calls to folio_put(). Some typical users of the + * folio refcount: + * + * - Each reference from a page table + * - The page cache + * - Filesystem private data + * - The LRU list + * - Pipes + * - Direct IO which references this page in the process address space + * + * Return: The number of references to this folio. 
+ */ +static inline int folio_ref_count(const struct folio *folio) +{ + return page_ref_count(&folio->page); +} + static inline int page_count(const struct page *page) { - return atomic_read(&compound_head(page)->_refcount); + return folio_ref_count(page_folio(page)); } static inline void set_page_count(struct page *page, int v) @@ -79,6 +101,11 @@ static inline void set_page_count(struct page *page, int v) __page_ref_set(page, v); } +static inline void folio_set_count(struct folio *folio, int v) +{ + set_page_count(&folio->page, v); +} + /* * Setup the page count before being freed into the page allocator for * the first time (boot or memory hotplug) @@ -95,6 +122,11 @@ static inline void page_ref_add(struct page *page, int nr) __page_ref_mod(page, nr); } +static inline void folio_ref_add(struct folio *folio, int nr) +{ + page_ref_add(&folio->page, nr); +} + static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_refcount); @@ -102,6 +134,11 @@ static inline void page_ref_sub(struct page *page, int nr) __page_ref_mod(page, -nr); } +static inline void folio_ref_sub(struct folio *folio, int nr) +{ + page_ref_sub(&folio->page, nr); +} + static inline int page_ref_sub_return(struct page *page, int nr) { int ret = atomic_sub_return(nr, &page->_refcount); @@ -111,6 +148,11 @@ static inline int page_ref_sub_return(struct page *page, int nr) return ret; } +static inline int folio_ref_sub_return(struct folio *folio, int nr) +{ + return page_ref_sub_return(&folio->page, nr); +} + static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); @@ -118,6 +160,11 @@ static inline void page_ref_inc(struct page *page) __page_ref_mod(page, 1); } +static inline void folio_ref_inc(struct folio *folio) +{ + page_ref_inc(&folio->page); +} + static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_refcount); @@ -125,6 +172,11 @@ static inline void page_ref_dec(struct page *page) __page_ref_mod(page, -1); } +static inline void folio_ref_dec(struct folio *folio) +{ + page_ref_dec(&folio->page); +} + static inline int page_ref_sub_and_test(struct page *page, int nr) { int ret = atomic_sub_and_test(nr, &page->_refcount); @@ -134,6 +186,11 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int folio_ref_sub_and_test(struct folio *folio, int nr) +{ + return page_ref_sub_and_test(&folio->page, nr); +} + static inline int page_ref_inc_return(struct page *page) { int ret = atomic_inc_return(&page->_refcount); @@ -143,6 +200,11 @@ static inline int page_ref_inc_return(struct page *page) return ret; } +static inline int folio_ref_inc_return(struct folio *folio) +{ + return page_ref_inc_return(&folio->page); +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); @@ -152,6 +214,11 @@ static inline int page_ref_dec_and_test(struct page *page) return ret; } +static inline int folio_ref_dec_and_test(struct folio *folio) +{ + return page_ref_dec_and_test(&folio->page); +} + static inline int page_ref_dec_return(struct page *page) { int ret = atomic_dec_return(&page->_refcount); @@ -161,15 +228,91 @@ static inline int page_ref_dec_return(struct page *page) return ret; } -static inline int page_ref_add_unless(struct page *page, int nr, int u) +static inline int folio_ref_dec_return(struct folio *folio) +{ + return page_ref_dec_return(&folio->page); +} + +static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - int ret = 
atomic_add_unless(&page->_refcount, nr, u); + bool ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } +static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +{ + return page_ref_add_unless(&folio->page, nr, u); +} + +/** + * folio_try_get - Attempt to increase the refcount on a folio. + * @folio: The folio. + * + * If you do not already have a reference to a folio, you can attempt to + * get one using this function. It may fail if, for example, the folio + * has been freed since you found a pointer to it, or it is frozen for + * the purposes of splitting or migration. + * + * Return: True if the reference count was successfully incremented. + */ +static inline bool folio_try_get(struct folio *folio) +{ + return folio_ref_add_unless(folio, 1, 0); +} + +static inline bool folio_ref_try_add_rcu(struct folio *folio, int count) +{ +#ifdef CONFIG_TINY_RCU + /* + * The caller guarantees the folio will not be freed from interrupt + * context, so (on !SMP) we only need preemption to be disabled + * and TINY_RCU does that for us. + */ +# ifdef CONFIG_PREEMPT_COUNT + VM_BUG_ON(!in_atomic() && !irqs_disabled()); +# endif + VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio); + folio_ref_add(folio, count); +#else + if (unlikely(!folio_ref_add_unless(folio, count, 0))) { + /* Either the folio has been freed, or will be freed. */ + return false; + } +#endif + return true; +} + +/** + * folio_try_get_rcu - Attempt to increase the refcount on a folio. + * @folio: The folio. + * + * This is a version of folio_try_get() optimised for non-SMP kernels. + * If you are still holding the rcu_read_lock() after looking up the + * page and know that the page cannot have its refcount decreased to + * zero in interrupt context, you can use this instead of folio_try_get(). + * + * Example users include get_user_pages_fast() (as pages are not unmapped + * from interrupt context) and the page cache lookups (as pages are not + * truncated from interrupt context). We also know that pages are not + * frozen in interrupt context for the purposes of splitting or migration. + * + * You can also use this function if you're holding a lock that prevents + * pages being frozen & removed; eg the i_pages lock for the page cache + * or the mmap_sem or page table lock for page tables. In this case, + * it will always succeed, and you could have used a plain folio_get(), + * but it's sometimes more convenient to have a common function called + * from both locked and RCU-protected contexts. + * + * Return: True if the reference count was successfully incremented. 
+ */ +static inline bool folio_try_get_rcu(struct folio *folio) +{ + return folio_ref_try_add_rcu(folio, 1); +} + static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); @@ -179,6 +322,11 @@ static inline int page_ref_freeze(struct page *page, int count) return ret; } +static inline int folio_ref_freeze(struct folio *folio, int count) +{ + return page_ref_freeze(&folio->page, count); +} + static inline void page_ref_unfreeze(struct page *page, int count) { VM_BUG_ON_PAGE(page_count(page) != 0, page); @@ -189,4 +337,8 @@ static inline void page_ref_unfreeze(struct page *page, int count) __page_ref_unfreeze(page, count); } +static inline void folio_ref_unfreeze(struct folio *folio, int count) +{ + page_ref_unfreeze(&folio->page, count); +} #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..013cdc90f5fd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -162,149 +162,119 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) void release_pages(struct page **pages, int nr); -/* - * For file cache pages, return the address_space, otherwise return NULL +struct address_space *page_mapping(struct page *); +struct address_space *folio_mapping(struct folio *); +struct address_space *swapcache_mapping(struct folio *); + +/** + * folio_file_mapping - Find the mapping this folio belongs to. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the mapping of the + * swap file or swap device where the data is stored. This is different + * from the mapping returned by folio_mapping(). The only reason to + * use it is if, like NFS, you return 0 from ->activate_swapfile. + * + * Do not call this for folios which aren't in the page cache or swap cache. */ -static inline struct address_space *page_mapping_file(struct page *page) +static inline struct address_space *folio_file_mapping(struct folio *folio) { - if (unlikely(PageSwapCache(page))) - return NULL; - return page_mapping(page); + if (unlikely(folio_test_swapcache(folio))) + return swapcache_mapping(folio); + + return folio->mapping; +} + +static inline struct address_space *page_file_mapping(struct page *page) +{ + return folio_file_mapping(page_folio(page)); } /* - * speculatively take a reference to a page. - * If the page is free (_refcount == 0), then _refcount is untouched, and 0 - * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. - * - * This function must be called inside the same rcu_read_lock() section as has - * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _refcount. - * - * Unless an RCU grace period has passed, the count of all pages coming out - * of the allocator must be considered unstable. page_count may return higher - * than expected, and put_page must be able to do the right thing when the - * page has been finished with, no matter what it is subsequently allocated - * for (because put_page is what is used here to drop an invalid speculative - * reference). - * - * This is the interesting part of the lockless pagecache (and lockless - * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page) - * has the following pattern: - * 1. find page in radix tree - * 2. conditionally increment refcount - * 3. 
check the page is still in pagecache (if no, goto 1) - * - * Remove-side that cares about stability of _refcount (eg. reclaim) has the - * following (with the i_pages lock held): - * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) - * B. remove page from pagecache - * C. free the page - * - * There are 2 critical interleavings that matter: - * - 2 runs before A: in this case, A sees elevated refcount and bails out - * - A runs before 2: in this case, 2 sees zero refcount and retries; - * subsequently, B will complete and 1 will find no page, causing the - * lookup to return NULL. - * - * It is possible that between 1 and 2, the page is removed then the exact same - * page is inserted into the same position in pagecache. That's OK: the - * old find_get_page using a lock could equally have run before or after - * such a re-insertion, depending on order that locks are granted. - * - * Lookups racing against pagecache insertion isn't a big problem: either 1 - * will find the page or it will not. Likewise, the old find_get_page could run - * either before the insertion or afterwards, depending on timing. + * For file cache pages, return the address_space, otherwise return NULL */ -static inline int __page_cache_add_speculative(struct page *page, int count) +static inline struct address_space *page_mapping_file(struct page *page) { -#ifdef CONFIG_TINY_RCU -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - /* - * Preempt must be disabled here - we rely on rcu_read_lock doing - * this for us. - * - * Pagecache won't be truncated from interrupt context, so if we have - * found a page in the radix tree here, we have pinned its refcount by - * disabling preempt, and hence no need for the "speculative get" that - * SMP requires. - */ - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, count); + struct folio *folio = page_folio(page); -#else - if (unlikely(!page_ref_add_unless(page, count, 0))) { - /* - * Either the page has been freed, or will be freed. - * In either case, retry here and the caller should - * do the right thing (see comments above). - */ - return 0; - } -#endif - VM_BUG_ON_PAGE(PageTail(page), page); - - return 1; + if (unlikely(folio_test_swapcache(folio))) + return NULL; + return folio_mapping(folio); } -static inline int page_cache_get_speculative(struct page *page) +static inline bool page_cache_add_speculative(struct page *page, int count) { - return __page_cache_add_speculative(page, 1); + VM_BUG_ON_PAGE(PageTail(page), page); + return folio_ref_try_add_rcu((struct folio *)page, count); } -static inline int page_cache_add_speculative(struct page *page, int count) +static inline bool page_cache_get_speculative(struct page *page) { - return __page_cache_add_speculative(page, count); + return page_cache_add_speculative(page, 1); } /** - * attach_page_private - Attach private data to a page. - * @page: Page to attach data to. - * @data: Data to attach to page. + * folio_attach_private - Attach private data to a folio. + * @folio: Folio to attach data to. + * @data: Data to attach to folio. * - * Attaching private data to a page increments the page's reference count. - * The data must be detached before the page will be freed. + * Attaching private data to a folio increments the page's reference count. + * The data must be detached before the folio will be freed. 
*/ -static inline void attach_page_private(struct page *page, void *data) +static inline void folio_attach_private(struct folio *folio, void *data) { - get_page(page); - set_page_private(page, (unsigned long)data); - SetPagePrivate(page); + folio_get(folio); + folio->private = data; + folio_set_private(folio); } /** - * detach_page_private - Detach private data from a page. - * @page: Page to detach data from. + * folio_detach_private - Detach private data from a folio. + * @folio: Folio to detach data from. * - * Removes the data that was previously attached to the page and decrements + * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * - * Return: Data that was attached to the page. + * Return: Data that was attached to the folio. */ -static inline void *detach_page_private(struct page *page) +static inline void *folio_detach_private(struct folio *folio) { - void *data = (void *)page_private(page); + void *data = folio_get_private(folio); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + folio_clear_private(folio); + folio->private = NULL; + folio_put(folio); return data; } +static inline void attach_page_private(struct page *page, void *data) +{ + folio_attach_private(page_folio(page), data); +} + +static inline void *detach_page_private(struct page *page) +{ + return folio_detach_private(page_folio(page)); +} + #ifdef CONFIG_NUMA -extern struct page *__page_cache_alloc(gfp_t gfp); +struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); #else -static inline struct page *__page_cache_alloc(gfp_t gfp) +static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) { - return alloc_pages(gfp, 0); + return folio_alloc(gfp, order); } #endif +static inline struct page *__page_cache_alloc(gfp_t gfp) +{ + return &filemap_alloc_folio(gfp, 0)->page; +} + static inline struct page *page_cache_alloc(struct address_space *x) { return __page_cache_alloc(mapping_gfp_mask(x)); @@ -331,9 +301,28 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 #define FGP_ENTRY 0x00000100 +#define FGP_STABLE 0x00000200 -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, - int fgp_flags, gfp_t cache_gfp_mask); +struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp); +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp); + +/** + * filemap_get_folio - Find and get a folio. + * @mapping: The address_space to search. + * @index: The page index. + * + * Looks up the page cache entry at @mapping & @index. If a folio is + * present, it is returned with an increased refcount. + * + * Otherwise, %NULL is returned. + */ +static inline struct folio *filemap_get_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, 0, 0); +} /** * find_get_page - find and get a page reference @@ -377,25 +366,6 @@ static inline struct page *find_lock_page(struct address_space *mapping, } /** - * find_lock_head - Locate, pin and lock a pagecache page. - * @mapping: The address_space to search. - * @index: The page index. - * - * Looks up the page cache entry at @mapping & @index. If there is a - * page cache page, its head page is returned locked and with an increased - * refcount. - * - * Context: May sleep. 
- * Return: A struct page which is !PageTail, or %NULL if there is no page - * in the cache for this index. - */ -static inline struct page *find_lock_head(struct address_space *mapping, - pgoff_t index) -{ - return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); -} - -/** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping @@ -452,6 +422,73 @@ static inline bool thp_contains(struct page *head, pgoff_t index) return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); } +#define swapcache_index(folio) __page_file_index(&(folio)->page) + +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the page is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. + */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swapcache_index(folio); + return folio->index; +} + +/** + * folio_next_index - Get the index of the next folio. + * @folio: The current folio. + * + * Return: The index of the folio which follows this folio in the file. + */ +static inline pgoff_t folio_next_index(struct folio *folio) +{ + return folio->index + folio_nr_pages(folio); +} + +/** + * folio_file_page - The page for a particular index. + * @folio: The folio which contains this index. + * @index: The index we want to look up. + * + * Sometimes after looking up a folio in the page cache, we need to + * obtain the specific page for an index (eg a page fault). + * + * Return: The page containing the file data for this index. + */ +static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (folio_test_hugetlb(folio)) + return &folio->page; + return folio_page(folio, index & (folio_nr_pages(folio) - 1)); +} + +/** + * folio_contains - Does this folio contain this index? + * @folio: The folio. + * @index: The page index within the file. + * + * Context: The caller should have the page locked in order to prevent + * (eg) shmem from moving the page between the page cache and swap cache + * and changing its index in the middle of the operation. + * Return: true or false. + */ +static inline bool folio_contains(struct folio *folio, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (folio_test_hugetlb(folio)) + return folio->index == index; + return index - folio_index(folio) < folio_nr_pages(folio); +} + /* * Given the page we found in the page cache, return the page corresponding * to this index in the file @@ -560,6 +597,27 @@ static inline loff_t page_file_offset(struct page *page) return ((loff_t)page_index(page)) << PAGE_SHIFT; } +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. + */ +static inline loff_t folio_pos(struct folio *folio) +{ + return page_offset(&folio->page); +} + +/** + * folio_file_pos - Returns the byte position of this folio in its file. + * @folio: The folio. + * + * This differs from folio_pos() for folios which belong to a swap file. + * NFS is the only filesystem today which needs to use folio_file_pos(). 
+ */ +static inline loff_t folio_file_pos(struct folio *folio) +{ + return page_file_offset(&folio->page); +} + extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); @@ -575,13 +633,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, } struct wait_page_key { - struct page *page; + struct folio *folio; int bit_nr; int page_match; }; struct wait_page_queue { - struct page *page; + struct folio *folio; int bit_nr; wait_queue_entry_t wait; }; @@ -589,7 +647,7 @@ struct wait_page_queue { static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { - if (wait_page->page != key->page) + if (wait_page->folio != key->folio) return false; key->page_match = 1; @@ -599,20 +657,31 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, return true; } -extern void __lock_page(struct page *page); -extern int __lock_page_killable(struct page *page); -extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); -extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, +void __folio_lock(struct folio *folio); +int __folio_lock_killable(struct folio *folio); +bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, unsigned int flags); -extern void unlock_page(struct page *page); +void unlock_page(struct page *page); +void folio_unlock(struct folio *folio); + +static inline bool folio_trylock(struct folio *folio) +{ + return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); +} /* * Return true if the page was successfully locked */ static inline int trylock_page(struct page *page) { - page = compound_head(page); - return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); + return folio_trylock(page_folio(page)); +} + +static inline void folio_lock(struct folio *folio) +{ + might_sleep(); + if (!folio_trylock(folio)) + __folio_lock(folio); } /* @@ -620,38 +689,30 @@ static inline int trylock_page(struct page *page) */ static inline void lock_page(struct page *page) { + struct folio *folio; might_sleep(); - if (!trylock_page(page)) - __lock_page(page); + + folio = page_folio(page); + if (!folio_trylock(folio)) + __folio_lock(folio); } -/* - * lock_page_killable is like lock_page but can be interrupted by fatal - * signals. It returns 0 if it locked the page and -EINTR if it was - * killed while waiting. - */ -static inline int lock_page_killable(struct page *page) +static inline int folio_lock_killable(struct folio *folio) { might_sleep(); - if (!trylock_page(page)) - return __lock_page_killable(page); + if (!folio_trylock(folio)) + return __folio_lock_killable(folio); return 0; } /* - * lock_page_async - Lock the page, unless this would block. If the page - * is already locked, then queue a callback when the page becomes unlocked. - * This callback can then retry the operation. - * - * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page - * was already locked and the callback defined in 'wait' was queued. + * lock_page_killable is like lock_page but can be interrupted by fatal + * signals. It returns 0 if it locked the page and -EINTR if it was + * killed while waiting. 
*/ -static inline int lock_page_async(struct page *page, - struct wait_page_queue *wait) +static inline int lock_page_killable(struct page *page) { - if (!trylock_page(page)) - return __lock_page_async(page, wait); - return 0; + return folio_lock_killable(page_folio(page)); } /* @@ -659,78 +720,108 @@ static inline int lock_page_async(struct page *page, * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see - * __lock_page_or_retry(). + * __folio_lock_or_retry(). */ -static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, +static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { + struct folio *folio; might_sleep(); - return trylock_page(page) || __lock_page_or_retry(page, mm, flags); + + folio = page_folio(page); + return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., + * This is exported only for folio_wait_locked/folio_wait_writeback, etc., * and should not be used directly. */ -extern void wait_on_page_bit(struct page *page, int bit_nr); -extern int wait_on_page_bit_killable(struct page *page, int bit_nr); +void folio_wait_bit(struct folio *folio, int bit_nr); +int folio_wait_bit_killable(struct folio *folio, int bit_nr); /* - * Wait for a page to be unlocked. + * Wait for a folio to be unlocked. * - * This must be called with the caller "holding" the page, - * ie with increased "page->count" so that the page won't + * This must be called with the caller "holding" the folio, + * ie with increased "page->count" so that the folio won't * go away during the wait.. */ +static inline void folio_wait_locked(struct folio *folio) +{ + if (folio_test_locked(folio)) + folio_wait_bit(folio, PG_locked); +} + +static inline int folio_wait_locked_killable(struct folio *folio) +{ + if (!folio_test_locked(folio)) + return 0; + return folio_wait_bit_killable(folio, PG_locked); +} + static inline void wait_on_page_locked(struct page *page) { - if (PageLocked(page)) - wait_on_page_bit(compound_head(page), PG_locked); + folio_wait_locked(page_folio(page)); } static inline int wait_on_page_locked_killable(struct page *page) { - if (!PageLocked(page)) - return 0; - return wait_on_page_bit_killable(compound_head(page), PG_locked); + return folio_wait_locked_killable(page_folio(page)); } int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); -int wait_on_page_writeback_killable(struct page *page); -extern void end_page_writeback(struct page *page); +void folio_wait_writeback(struct folio *folio); +int folio_wait_writeback_killable(struct folio *folio); +void end_page_writeback(struct page *page); +void folio_end_writeback(struct folio *folio); void wait_for_stable_page(struct page *page); +void folio_wait_stable(struct folio *folio); +void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); +static inline void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + __folio_mark_dirty(page_folio(page), mapping, warn); +} +void folio_account_cleaned(struct folio *folio, struct address_space *mapping, + struct bdi_writeback *wb); +static inline void account_page_cleaned(struct page *page, + struct address_space *mapping, struct bdi_writeback *wb) +{ + return folio_account_cleaned(page_folio(page), mapping, wb); +} +void __folio_cancel_dirty(struct folio *folio); +static inline void 
folio_cancel_dirty(struct folio *folio) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (folio_test_dirty(folio)) + __folio_cancel_dirty(folio); +} +static inline void cancel_dirty_page(struct page *page) +{ + folio_cancel_dirty(page_folio(page)); +} +bool folio_clear_dirty_for_io(struct folio *folio); +bool clear_page_dirty_for_io(struct page *page); +int __must_check folio_write_one(struct folio *folio); +static inline int __must_check write_one_page(struct page *page) +{ + return folio_write_one(page_folio(page)); +} -void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); void page_endio(struct page *page, bool is_write, int err); -/** - * set_page_private_2 - Set PG_private_2 on a page and take a ref - * @page: The page. - * - * Set the PG_private_2 flag on a page and take the reference needed for the VM - * to handle its lifetime correctly. This sets the flag and takes the - * reference unconditionally, so care must be taken not to set the flag again - * if it's already set. - */ -static inline void set_page_private_2(struct page *page) -{ - page = compound_head(page); - get_page(page); - SetPagePrivate2(page); -} - -void end_page_private_2(struct page *page); -void wait_on_page_private_2(struct page *page); -int wait_on_page_private_2_killable(struct page *page); +void folio_end_private_2(struct folio *folio); +void folio_wait_private_2(struct folio *folio); +int folio_wait_private_2_killable(struct folio *folio); /* * Add an arbitrary waiter to a page's wait queue */ -extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); +void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. @@ -790,9 +881,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) } int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); + pgoff_t index, gfp_t gfp); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); + pgoff_t index, gfp_t gfp); +int filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); @@ -817,6 +910,10 @@ static inline int add_to_page_cache(struct page *page, return error; } +/* Must be non-static for BPF error injection */ +int __filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp, void **shadowp); + /** * struct readahead_control - Describes a readahead request. 
* @@ -906,33 +1003,57 @@ void page_cache_async_readahead(struct address_space *mapping, page_cache_async_ra(&ractl, page, req_count); } +static inline struct folio *__readahead_folio(struct readahead_control *ractl) +{ + struct folio *folio; + + BUG_ON(ractl->_batch_count > ractl->_nr_pages); + ractl->_nr_pages -= ractl->_batch_count; + ractl->_index += ractl->_batch_count; + + if (!ractl->_nr_pages) { + ractl->_batch_count = 0; + return NULL; + } + + folio = xa_load(&ractl->mapping->i_pages, ractl->_index); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + ractl->_batch_count = folio_nr_pages(folio); + + return folio; +} + /** * readahead_page - Get the next page to read. - * @rac: The current readahead request. + * @ractl: The current readahead request. * * Context: The page is locked and has an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: A pointer to the next page, or %NULL if we are done. */ -static inline struct page *readahead_page(struct readahead_control *rac) +static inline struct page *readahead_page(struct readahead_control *ractl) { - struct page *page; - - BUG_ON(rac->_batch_count > rac->_nr_pages); - rac->_nr_pages -= rac->_batch_count; - rac->_index += rac->_batch_count; + struct folio *folio = __readahead_folio(ractl); - if (!rac->_nr_pages) { - rac->_batch_count = 0; - return NULL; - } + return &folio->page; +} - page = xa_load(&rac->mapping->i_pages, rac->_index); - VM_BUG_ON_PAGE(!PageLocked(page), page); - rac->_batch_count = thp_nr_pages(page); +/** + * readahead_folio - Get the next folio to read. + * @ractl: The current readahead request. + * + * Context: The folio is locked. The caller should unlock the folio once + * all I/O to that folio has completed. + * Return: A pointer to the next folio, or %NULL if we are done. + */ +static inline struct folio *readahead_folio(struct readahead_control *ractl) +{ + struct folio *folio = __readahead_folio(ractl); - return page; + if (folio) + folio_put(folio); + return folio; } static inline unsigned int __readahead_batch(struct readahead_control *rac, @@ -1040,6 +1161,34 @@ static inline unsigned long dir_pages(struct inode *inode) } /** + * folio_mkwrite_check_truncate - check if folio was truncated + * @folio: the folio to check + * @inode: the inode to check the folio against + * + * Return: the number of bytes in the folio up to EOF, + * or -EFAULT if the folio was truncated. + */ +static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, + struct inode *inode) +{ + loff_t size = i_size_read(inode); + pgoff_t index = size >> PAGE_SHIFT; + size_t offset = offset_in_folio(folio, size); + + if (!folio->mapping) + return -EFAULT; + + /* folio is wholly inside EOF */ + if (folio_next_index(folio) - 1 < index) + return folio_size(folio); + /* folio is wholly past EOF */ + if (folio->index > index || !offset) + return -EFAULT; + /* folio is partially inside EOF */ + return offset; +} + +/** * page_mkwrite_check_truncate - check if page was truncated * @page: the page to check * @inode: the inode to check the page against @@ -1068,19 +1217,25 @@ static inline int page_mkwrite_check_truncate(struct page *page, } /** - * i_blocks_per_page - How many blocks fit in this page. + * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. - * @page: The page (head page if the page is a THP). + * @folio: The folio. 
* - * If the block size is larger than the size of this page, return zero. + * If the block size is larger than the size of this folio, return zero. * - * Context: The caller should hold a refcount on the page to prevent it + * Context: The caller should hold a refcount on the folio to prevent it * from being split. - * Return: The number of filesystem blocks covered by this page. + * Return: The number of filesystem blocks covered by this folio. */ static inline +unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +{ + return folio_size(folio) >> inode->i_blkbits; +} + +static inline unsigned int i_blocks_per_page(struct inode *inode, struct page *page) { - return thp_size(page) >> inode->i_blkbits; + return i_blocks_per_folio(inode, page_folio(page)); } #endif /* _LINUX_PAGEMAP_H */ diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index d2558121d48c..6f7949b2fd8d 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -3,6 +3,7 @@ #define _LINUX_PART_STAT_H #include <linux/genhd.h> +#include <asm/local.h> struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index ae16a9856305..b31d3f3312ce 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -267,6 +267,28 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) } /** + * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the + * caller is responsible for taking RCU. + * + * This function is safe to call as long as @ref is between init and exit. + */ +static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) +{ + unsigned long __percpu *percpu_count; + bool ret = false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(__ref_is_percpu(ref, &percpu_count))) { + this_cpu_inc(*percpu_count); + ret = true; + } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { + ret = atomic_long_inc_not_zero(&ref->data->count); + } + return ret; +} + +/** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * @@ -283,20 +305,11 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned long __percpu *percpu_count; bool ret = false; rcu_read_lock(); - - if (__ref_is_percpu(ref, &percpu_count)) { - this_cpu_inc(*percpu_count); - ret = true; - } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { - ret = atomic_long_inc_not_zero(&ref->data->count); - } - + ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); - return ret; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9b60bb89d86a..22212f2a8219 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1400,6 +1400,7 @@ perf_event_addr_filters(struct perf_event *event) } extern void perf_event_addr_filters_sync(struct perf_event *event); +extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, diff --git a/include/linux/platform_data/usb-omap1.h b/include/linux/platform_data/usb-omap1.h index 43b5ce139c37..878e572a78bf 100644 --- a/include/linux/platform_data/usb-omap1.h +++ b/include/linux/platform_data/usb-omap1.h @@ -48,6 +48,8 @@ struct omap_usb_config { u32 (*usb2_init)(unsigned nwires, unsigned alt_pingroup); int (*ocpi_enable)(void); + + void (*lb_reset)(void); }; #endif /* __LINUX_USB_OMAP1_H */ 
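As a quick illustration of the percpu_ref_tryget_live_rcu() helper introduced in the percpu-refcount.h hunk above: it differs from percpu_ref_tryget_live() only in that the caller must already be inside an RCU read-side critical section (the helper WARNs otherwise). The following is a minimal sketch, not code from the patch; my_obj and my_obj_get_live() are hypothetical names used purely for illustration.

#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>

struct my_obj {
	struct percpu_ref ref;
	/* ... object payload ... */
};

/* Caller found @obj under RCU protection elsewhere (hypothetical setup). */
static struct my_obj *my_obj_get_live(struct my_obj *obj)
{
	struct my_obj *ret = NULL;

	rcu_read_lock();
	/* Take a reference only if the ref has not been killed yet. */
	if (obj && percpu_ref_tryget_live_rcu(&obj->ref))
		ret = obj;
	rcu_read_unlock();

	return ret;	/* drop later with percpu_ref_put(&ret->ref) */
}

The point of the new helper, as the hunk above shows, is simply to let callers that already hold rcu_read_lock() skip a redundant nested RCU read-side section; percpu_ref_tryget_live() itself is reworked in the same hunk to wrap this helper between rcu_read_lock()/rcu_read_unlock().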
diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4d244e295e85..031898b38d06 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -122,9 +122,10 @@ * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) -#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET +#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else -#define PREEMPT_LOCK_OFFSET 0 +/* Locks on RT do not disable preemption */ +#define PREEMPT_LOCK_OFFSET 0 #endif /* diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index c0475d1c9885..81cad9e1e412 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -61,7 +61,6 @@ enum qcom_scm_ice_cipher { #define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE) #define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC) -#if IS_ENABLED(CONFIG_QCOM_SCM) extern bool qcom_scm_is_available(void); extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); @@ -115,74 +114,4 @@ extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, extern int qcom_scm_lmh_profile_change(u32 profile_id); extern bool qcom_scm_lmh_dcvsh_available(void); -#else - -#include <linux/errno.h> - -static inline bool qcom_scm_is_available(void) { return false; } - -static inline int qcom_scm_set_cold_boot_addr(void *entry, - const cpumask_t *cpus) { return -ENODEV; } -static inline int qcom_scm_set_warm_boot_addr(void *entry, - const cpumask_t *cpus) { return -ENODEV; } -static inline void qcom_scm_cpu_power_down(u32 flags) {} -static inline u32 qcom_scm_set_remote_state(u32 state,u32 id) - { return -ENODEV; } - -static inline int qcom_scm_pas_init_image(u32 peripheral, const void *metadata, - size_t size) { return -ENODEV; } -static inline int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, - phys_addr_t size) { return -ENODEV; } -static inline int qcom_scm_pas_auth_and_reset(u32 peripheral) - { return -ENODEV; } -static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; } -static inline bool qcom_scm_pas_supported(u32 peripheral) { return false; } - -static inline int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val) - { return -ENODEV; } -static inline int qcom_scm_io_writel(phys_addr_t addr, unsigned int val) - { return -ENODEV; } - -static inline bool qcom_scm_restore_sec_cfg_available(void) { return false; } -static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) - { return -ENODEV; } -static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) - { return -ENODEV; } -static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) - { return -ENODEV; } -extern inline int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size, - u32 cp_nonpixel_start, - u32 cp_nonpixel_size) - { return -ENODEV; } -static inline int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, - unsigned int *src, const struct qcom_scm_vmperm *newvm, - unsigned int dest_cnt) { return -ENODEV; } - -static inline bool qcom_scm_ocmem_lock_available(void) { return false; } -static inline int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset, - u32 size, u32 mode) { return -ENODEV; } -static inline int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, - u32 offset, u32 size) { return -ENODEV; } - -static inline bool qcom_scm_ice_available(void) { return false; } -static inline int qcom_scm_ice_invalidate_key(u32 index) { return -ENODEV; } -static inline int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 
key_size, - enum qcom_scm_ice_cipher cipher, - u32 data_unit_size) { return -ENODEV; } - -static inline bool qcom_scm_hdcp_available(void) { return false; } -static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, - u32 *resp) { return -ENODEV; } - -static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en) - { return -ENODEV; } - -static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, - u64 limit_node, u32 node_id, u64 version) - { return -ENODEV; } - -static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; } - -static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; } -#endif #endif diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c976cc6de257..e704b1a4c06c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,7 +235,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); * * returns the number of cleaned PTEs. */ -int page_mkclean(struct page *); +int folio_mkclean(struct folio *); /* * called in munlock()/munmap() path to check for other vmas holding @@ -295,12 +295,14 @@ static inline void try_to_unmap(struct page *page, enum ttu_flags flags) { } -static inline int page_mkclean(struct page *page) +static inline int folio_mkclean(struct folio *folio) { return 0; } - - #endif /* CONFIG_MMU */ +static inline int page_mkclean(struct page *page) +{ + return folio_mkclean(page_folio(page)); +} #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 7ce9a51ae5c0..2c0ad417ce3c 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -30,31 +30,16 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_read_lock(rwlock_t *lock) __acquires(lock); -#define do_raw_read_lock_flags(lock, flags) do_raw_read_lock(lock) extern int do_raw_read_trylock(rwlock_t *lock); extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock); extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock); -#define do_raw_write_lock_flags(lock, flags) do_raw_write_lock(lock) extern int do_raw_write_trylock(rwlock_t *lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else - -#ifndef arch_read_lock_flags -# define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#endif - -#ifndef arch_write_lock_flags -# define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#endif - # define do_raw_read_lock(rwlock) do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_read_lock_flags(lock, flags) \ - do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0) # define do_raw_read_trylock(rwlock) arch_read_trylock(&(rwlock)->raw_lock) # define do_raw_read_unlock(rwlock) do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) # define do_raw_write_lock(rwlock) do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_write_lock_flags(lock, flags) \ - do {__acquire(lock); arch_write_lock_flags(&(lock)->raw_lock, *(flags)); } while (0) # define do_raw_write_trylock(rwlock) arch_write_trylock(&(rwlock)->raw_lock) # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index abfb53ab11be..f1db6f17c4fb 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -157,8 +157,7 @@ static inline unsigned long __raw_read_lock_irqsave(rwlock_t 
*lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_read_trylock, do_raw_read_lock, - do_raw_read_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); return flags; } @@ -184,8 +183,7 @@ static inline unsigned long __raw_write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_write_trylock, do_raw_write_lock, - do_raw_write_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); return flags; } diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 2713e689ad66..4a6ff274335a 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -427,6 +427,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** + * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits + * @sbq: Bitmap queue to allocate from. + * @nr_tags: number of tags requested + * @offset: offset to add to returned bits + * + * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is + * a bit in the mask returned, and the caller must add @offset to the value to + * get the absolute tag value. + */ +unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, + unsigned int *offset); + +/** * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. @@ -515,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); +/** + * sbitmap_queue_clear_batch() - Free a batch of allocated bits + * &struct sbitmap_queue. + * @sbq: Bitmap to free from. 
+ * @offset: offset for each tag in array + * @tags: array of tags + * @nr_tags: number of tags in array + */ +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, + int *tags, int nr_tags); + static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5f8db54226af..f44e14c43b74 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1162,10 +1162,8 @@ struct task_struct { /* Stacked block device info: */ struct bio_list *bio_list; -#ifdef CONFIG_BLOCK /* Stack plugging: */ struct blk_plug *plug; -#endif /* VM state: */ struct reclaim_state *reclaim_state; @@ -2041,7 +2039,7 @@ static inline int _cond_resched(void) { return 0; } #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ #define cond_resched() ({ \ - ___might_sleep(__FILE__, __LINE__, 0); \ + __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @@ -2049,19 +2047,38 @@ extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); -#define cond_resched_lock(lock) ({ \ - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ - __cond_resched_lock(lock); \ +#define MIGHT_RESCHED_RCU_SHIFT 8 +#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) + +#ifndef CONFIG_PREEMPT_RT +/* + * Non RT kernels have an elevated preempt count due to the held lock, + * but are not allowed to be inside a RCU read side critical section + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET +#else +/* + * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in + * cond_resched*lock() has to take that into account because it checks for + * preempt_count() and rcu_preempt_depth(). 
+ */ +# define PREEMPT_LOCK_RESCHED_OFFSETS \ + (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) +#endif + +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_lock(lock); \ }) -#define cond_resched_rwlock_read(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_read(lock); \ +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_read(lock); \ }) -#define cond_resched_rwlock_write(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_write(lock); \ +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 21c3771e6a56..988528b5da43 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -23,7 +23,7 @@ static inline bool page_is_secretmem(struct page *page) mapping = (struct address_space *) ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); - if (mapping != page->mapping) + if (!mapping || mapping != page->mapping) return false; return mapping->a_ops == &secretmem_aops; diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 14ab0c0bc924..1ce9a9eb223b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8371bca13729..6b0b686f6f90 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -531,6 +531,9 @@ struct spi_controller { /* I/O mutex */ struct mutex io_mutex; + /* Used to avoid adding the same CS twice */ + struct mutex add_lock; + /* lock and mutex for SPI bus locking */ spinlock_t bus_lock_spinlock; struct mutex bus_lock_mutex; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 45310ea1b1d7..f0447062eecd 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -177,7 +177,6 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); -#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) extern int do_raw_spin_trylock(raw_spinlock_t *lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else @@ -188,18 +187,6 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) mmiowb_spin_lock(); } -#ifndef arch_spin_lock_flags -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -#endif - -static inline void -do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock) -{ - __acquire(lock); - arch_spin_lock_flags(&lock->raw_lock, *flags); - mmiowb_spin_lock(); -} - static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 6b8e1a0b137b..51fa0dab68c4 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -108,16 +108,7 @@ static inline 
unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock) local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * do_raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -#else - do_raw_spin_lock_flags(lock, &flags); -#endif return flags; } diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 0ac9112c1bbe..16521074b6f7 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -62,7 +62,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched/core.c and kernel_lock.c: */ # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) -# define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) # define arch_spin_unlock(lock) do { barrier(); (void)(lock); } while (0) # define arch_spin_trylock(lock) ({ barrier(); (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ diff --git a/include/linux/swap.h b/include/linux/swap.h index ba52f3a3478e..cdf0957a88a4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,11 +320,17 @@ struct vma_swap_readahead { #endif }; +static inline swp_entry_t folio_swap_entry(struct folio *folio) +{ + swp_entry_t entry = { .val = page_private(&folio->page) }; + return entry; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); -void workingset_refault(struct page *page, void *shadow); -void workingset_activation(struct page *page); +void workingset_refault(struct folio *folio, void *shadow); +void workingset_activation(struct folio *folio); /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); @@ -344,9 +350,11 @@ extern unsigned long nr_free_buffer_pages(void); /* linux/mm/swap.c */ extern void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -extern void lru_note_cost_page(struct page *); +extern void lru_note_cost_folio(struct folio *); +extern void folio_add_lru(struct folio *); extern void lru_cache_add(struct page *); -extern void mark_page_accessed(struct page *); +void mark_page_accessed(struct page *); +void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; @@ -365,7 +373,6 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 252243c7783d..528a478dbda8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -58,6 +58,7 @@ struct mq_attr; struct compat_stat; struct old_timeval32; struct robust_list_head; +struct futex_waitv; struct getcpu_cache; struct old_linux_dirent; struct perf_event_attr; @@ -610,7 +611,7 @@ asmlinkage long sys_waitid(int which, pid_t pid, asmlinkage long sys_set_tid_address(int __user *tidptr); asmlinkage long sys_unshare(unsigned long unshare_flags); -/* kernel/futex.c */ +/* 
kernel/futex/syscalls.c */ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, const struct __kernel_timespec __user *utime, u32 __user *uaddr2, u32 val3); @@ -623,6 +624,10 @@ asmlinkage long sys_get_robust_list(int pid, asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); +asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, + unsigned int nr_futexes, unsigned int flags, + struct __kernel_timespec __user *timeout, clockid_t clockid); + /* kernel/hrtimer.c */ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 96305a64a5a7..c635c2e014e3 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -3,7 +3,7 @@ #define _LINUX_T10_PI_H #include <linux/types.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> /* * A T10 PI-capable target device can be formatted with different diff --git a/include/linux/tpm.h b/include/linux/tpm.h index aa11fe323c56..12d827734686 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -269,6 +269,7 @@ enum tpm2_cc_attrs { #define TPM_VID_INTEL 0x8086 #define TPM_VID_WINBOND 0x1050 #define TPM_VID_STM 0x104A +#define TPM_VID_ATML 0x1114 enum tpm_chip_flags { TPM_CHIP_FLAG_TPM2 = BIT(1), diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index a9f9c5714e65..fe95f0922526 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -16,23 +16,8 @@ * When function tracing occurs, the following steps are made: * If arch does not support a ftrace feature: * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... * The function callback, which can use the FTRACE bits to * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. */ enum { /* Function recursion bits */ @@ -40,12 +25,14 @@ enum { TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, + TRACE_FTRACE_TRANSITION_BIT, - /* INTERNAL_BITs must be greater than FTRACE_BITs */ + /* Internal use recursion bits */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, + TRACE_INTERNAL_TRANSITION_BIT, TRACE_BRANCH_BIT, /* @@ -86,12 +73,6 @@ enum { */ TRACE_GRAPH_NOTRACE_BIT, - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. - */ - TRACE_TRANSITION_BIT, - /* Used to prevent recursion recording from recursing. 
*/ TRACE_RECORD_RECURSION_BIT, }; @@ -113,12 +94,10 @@ enum { #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) #define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX +#define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) /* * Used for setting context @@ -132,6 +111,7 @@ enum { TRACE_CTX_IRQ, TRACE_CTX_SOFTIRQ, TRACE_CTX_NORMAL, + TRACE_CTX_TRANSITION, }; static __always_inline int trace_get_context_bit(void) @@ -160,45 +140,34 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); #endif static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, - int start, int max) + int start) { unsigned int val = READ_ONCE(current->trace_recursion); int bit; - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* * It could be that preempt_count has not been updated during * a switch between contexts. Allow for a single recursion. */ - bit = TRACE_TRANSITION_BIT; + bit = TRACE_CTX_TRANSITION + start; if (val & (1 << bit)) { do_ftrace_record_recursion(ip, pip); return -1; } - } else { - /* Normal check passed, clear the transition to allow it again */ - val &= ~(1 << TRACE_TRANSITION_BIT); } val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit + 1; + return bit; } static __always_inline void trace_clear_recursion(int bit) { - if (!bit) - return; - barrier(); - bit--; trace_recursion_clear(bit); } @@ -214,7 +183,7 @@ static __always_inline void trace_clear_recursion(int bit) static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, unsigned long parent_ip) { - return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX); + return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START); } /** diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index eb70cabe6e7f..33a4240e6a6f 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -127,6 +127,8 @@ static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type t long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type); +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type); bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); static inline void set_rlimit_ucount_max(struct user_namespace *ns, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d6a6cf53b127..bfe38869498d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -415,6 +415,78 @@ static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ +static inline void __zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + __mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void __zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void __zone_stat_sub_folio(struct folio 
*folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void __node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + __mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void __node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void __node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + +static inline void node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { @@ -525,12 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - static inline void __inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { @@ -543,6 +609,24 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + __mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void __lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void __lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} + static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { @@ -555,4 +639,21 @@ static inline void dec_lruvec_page_state(struct page *page, mod_lruvec_page_state(page, idx, -1); } +static inline void lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} #endif /* _LINUX_VMSTAT_H */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2ebef6b1a3d6..74d3c1efd9bb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -399,9 +399,8 @@ 
extern struct workqueue_struct *system_freezable_power_efficient_wq; * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...); +__printf(1, 4) struct workqueue_struct * +alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d1f65adf6a26..3bfd487d1dd2 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,7 +11,6 @@ #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> -#include <linux/blk-cgroup.h> struct bio; @@ -109,15 +108,12 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc) return flags; } -static inline struct cgroup_subsys_state * -wbc_blkcg_css(struct writeback_control *wbc) -{ #ifdef CONFIG_CGROUP_WRITEBACK - if (wbc->wb) - return wbc->wb->blkcg_css; -#endif - return blkcg_root_css; -} +#define wbc_blkcg_css(wbc) \ + ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css) +#else +#define wbc_blkcg_css(wbc) (blkcg_root_css) +#endif /* CONFIG_CGROUP_WRITEBACK */ /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to @@ -393,7 +389,14 @@ void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); -void account_page_redirty(struct page *page); +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); +void folio_account_redirty(struct folio *folio); +static inline void account_page_redirty(struct page *page) +{ + folio_account_redirty(page_folio(page)); +} +bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); +bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 29db736af86d..bb763085479a 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -28,12 +28,10 @@ #ifndef CONFIG_PREEMPT_RT #define WW_MUTEX_BASE mutex #define ww_mutex_base_init(l,n,k) __mutex_init(l,n,k) -#define ww_mutex_base_trylock(l) mutex_trylock(l) #define ww_mutex_base_is_locked(b) mutex_is_locked((b)) #else #define WW_MUTEX_BASE rt_mutex #define ww_mutex_base_init(l,n,k) __rt_mutex_init(l,n,k) -#define ww_mutex_base_trylock(l) rt_mutex_trylock(l) #define ww_mutex_base_is_locked(b) rt_mutex_base_is_locked(&(b)->rtmutex) #endif @@ -339,17 +337,8 @@ ww_mutex_lock_slow_interruptible(struct ww_mutex *lock, extern void ww_mutex_unlock(struct ww_mutex *lock); -/** - * ww_mutex_trylock - tries to acquire the w/w mutex without acquire context - * @lock: mutex to lock - * - * Trylocks a mutex without acquire context, so no deadlock detection is - * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise. 
- */ -static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock) -{ - return ww_mutex_base_trylock(&lock->base); -} +extern int __must_check ww_mutex_trylock(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /*** * ww_mutex_destroy - mark a w/w mutex unusable diff --git a/include/linux/xz.h b/include/linux/xz.h index 9884c8440188..7285ca5d56e9 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -234,6 +234,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s); XZ_EXTERN void xz_dec_end(struct xz_dec *s); /* + * Decompressor for MicroLZMA, an LZMA variant with a very minimal header. + * See xz_dec_microlzma_alloc() below for details. + * + * These functions aren't used or available in preboot code and thus aren't + * marked with XZ_EXTERN. This avoids warnings about static functions that + * are never defined. + */ +/** + * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state + */ +struct xz_dec_microlzma; + +/** + * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder + * @mode XZ_SINGLE or XZ_PREALLOC + * @dict_size LZMA dictionary size. This must be at least 4 KiB and + * at most 3 GiB. + * + * In contrast to xz_dec_init(), this function only allocates the memory + * and remembers the dictionary size. xz_dec_microlzma_reset() must be used + * before calling xz_dec_microlzma_run(). + * + * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE. + * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated. + * + * On success, xz_dec_microlzma_alloc() returns a pointer to + * struct xz_dec_microlzma. If memory allocation fails or + * dict_size is invalid, NULL is returned. + * + * The compressed format supported by this decoder is a raw LZMA stream + * whose first byte (always 0x00) has been replaced with bitwise-negation + * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is + * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. + * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream + * marker must not be used. The unused values are reserved for future use. + * This MicroLZMA header format was created for use in EROFS but may be used + * by others too. + */ +extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); + +/** + * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state + * @s Decoder state allocated using xz_dec_microlzma_alloc() + * @comp_size Compressed size of the input stream + * @uncomp_size Uncompressed size of the input stream. A value smaller + * than the real uncompressed size of the input stream can + * be specified if uncomp_size_is_exact is set to false. + * uncomp_size can never be set to a value larger than the + * expected real uncompressed size because it would eventually + * result in XZ_DATA_ERROR. + * @uncomp_size_is_exact This is an int instead of bool to avoid + * requiring stdbool.h. This should normally be set to true. + * When this is set to false, error detection is weaker. + */ +extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, + uint32_t comp_size, uint32_t uncomp_size, + int uncomp_size_is_exact); + +/** + * xz_dec_microlzma_run() - Run the MicroLZMA decoder + * @s Decoder state initialized using xz_dec_microlzma_reset() + * @b: Input and output buffers + * + * This works similarly to xz_dec_run() with a few important differences. + * Only the differences are documented here. 
+ * + * The only possible return values are XZ_OK, XZ_STREAM_END, and + * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress + * is possible due to lack of input data or output space, this function will + * keep returning XZ_OK. Thus, the calling code must be written so that it + * will eventually provide input and output space matching (or exceeding) + * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset(). + * If the caller cannot do this (for example, if the input file is truncated + * or otherwise corrupt), the caller must detect this error by itself to + * avoid an infinite loop. + * + * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned. + * This can happen also when incorrect dictionary, uncompressed, or + * compressed sizes have been specified. + * + * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over + * uncompressed data. This way the caller doesn't need to provide a temporary + * output buffer for the bytes that will be ignored. + * + * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK + * is also possible and thus XZ_SINGLE is actually a limited multi-call mode. + * After XZ_OK the bytes decoded so far may be read from the output buffer. + * It is possible to continue decoding but the variables b->out and b->out_pos + * MUST NOT be changed by the caller. Increasing the value of b->out_size is + * allowed to make more output space available; one doesn't need to provide + * space for the whole uncompressed data on the first call. The input buffer + * may be changed normally like with XZ_PREALLOC. This way input data can be + * provided from non-contiguous memory. + */ +extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, + struct xz_buf *b); + +/** + * xz_dec_microlzma_end() - Free the memory allocated for the decoder state + * @s: Decoder state allocated using xz_dec_microlzma_alloc(). + * If s is NULL, this function does nothing. + */ +extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); + +/* * Standalone build (userspace build or in-kernel build for boot time use) * needs a CRC32 implementation. 
For normal in-kernel use, kernel's own * CRC32 module is used instead, and users of this module don't need to diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 62dd8422e0dc..27336fc70467 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5376,7 +5376,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * netdev and may otherwise be used by driver read-only, will be update * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames - * @mgmt_registrations_lock: lock for the list * @mgmt_registrations_need_update: mgmt registrations were updated, * need to propagate the update to the driver * @mtx: mutex used to lock data in this struct, may be used by drivers @@ -5423,7 +5422,6 @@ struct wireless_dev { u32 identifier; struct list_head mgmt_registrations; - spinlock_t mgmt_registrations_lock; u8 mgmt_registrations_need_update:1; struct mutex mtx; diff --git a/include/net/mctp.h b/include/net/mctp.h index a824d47c3c6d..ffd2c23bd76d 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -54,7 +54,7 @@ struct mctp_sock { struct sock sk; /* bind() params */ - int bind_net; + unsigned int bind_net; mctp_eid_t bind_addr; __u8 bind_type; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 6026bbefbffd..3214848402ec 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -69,6 +69,10 @@ struct mptcp_out_options { struct { u64 sndr_key; u64 rcvr_key; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + __sum16 csum; }; struct { struct mptcp_addr_info addr; diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 0fd8a4159662..ceadf8ba25a4 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -17,7 +17,6 @@ struct inet_frags_ctl; struct nft_ct_frag6_pernet { struct ctl_table_header *nf_frag_frags_hdr; struct fqdir *fqdir; - unsigned int users; }; #endif /* _NF_DEFRAG_IPV6_H */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 148f5d8ee5ab..a16171c5fd9e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1202,7 +1202,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, - int event, int family, int report, gfp_t gfp); + int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 986a2a9cfdfa..b593f95e9991 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -27,5 +27,11 @@ struct netns_nf { #if IS_ENABLED(CONFIG_DECNET) struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; #endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + unsigned int defrag_ipv4_users; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + unsigned int defrag_ipv6_users; +#endif }; #endif diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2eb6d7c2c931..f37c7a558d6d 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -384,11 +384,11 @@ sctp_vtag_verify(const struct sctp_chunk *chunk, * Verification Tag value does not match the receiver's own * tag value, the receiver shall silently discard the packet... 
*/ - if (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag) - return 1; + if (ntohl(chunk->sctp_hdr->vtag) != asoc->c.my_vtag) + return 0; chunk->transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; - return 0; + return 1; } /* Check VTAG of the packet matches the sender's own tag and the T bit is diff --git a/include/net/sock.h b/include/net/sock.h index ae929e21a376..463f390d90b3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -307,6 +307,7 @@ struct bpf_local_storage; * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family + * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting @@ -1207,7 +1208,7 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk, int wake); - bool (*stream_memory_read)(const struct sock *sk); + bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); @@ -2819,4 +2820,10 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +static inline bool sk_is_readable(struct sock *sk) +{ + if (sk->sk_prot->sock_is_readable) + return sk->sk_prot->sock_is_readable(sk); + return false; +} #endif /* _SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 3166dc15d7d6..60c384569e9c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1576,6 +1576,7 @@ struct tcp_md5sig_key { u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; + u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; @@ -1621,10 +1622,10 @@ struct tcp_md5sig_pool { int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, + int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index); + int family, u8 prefixlen, int l3index, u8 flags); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); diff --git a/include/net/tls.h b/include/net/tls.h index be4b3e1cac46..1fffb206f09f 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -358,6 +358,7 @@ int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); +void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); @@ -375,7 +376,7 @@ void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -bool tls_sw_stream_read(const struct sock *sk); +bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); @@ -466,12 +467,6 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock 
*sk) #endif } -static inline void tls_err_abort(struct sock *sk, int err) -{ - sk->sk_err = err; - sk_error_report(sk); -} - static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; @@ -512,7 +507,7 @@ static inline void tls_advance_record_sn(struct sock *sk, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) diff --git a/include/net/udp.h b/include/net/udp.h index 360df454356c..909ecf447e0f 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -494,8 +494,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit - * their ip_summed is set to CHECKSUM_UNNECESSARY. Reset in this - * specific case, where PARTIAL is both correct and required. + * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. + * Reset in this specific case, where PARTIAL is both correct and + * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index eaf04c9a1dfc..31078063afac 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -396,4 +396,7 @@ static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq); +struct request *scsi_alloc_request(struct request_queue *q, + unsigned int op, blk_mq_req_flags_t flags); + #endif /* _SCSI_SCSI_CMND_H */ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index b97e142a7ca9..430b73bd02ac 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -5,7 +5,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <scsi/scsi.h> #include <linux/atomic.h> #include <linux/sbitmap.h> diff --git a/include/soc/arc/timers.h b/include/soc/arc/timers.h index 7ecde3b159c8..ae99d3e855f1 100644 --- a/include/soc/arc/timers.h +++ b/include/soc/arc/timers.h @@ -17,8 +17,8 @@ #define ARC_REG_TIMER1_CNT 0x100 /* timer 1 count */ /* CTRL reg bits */ -#define TIMER_CTRL_IE (1 << 0) /* Interrupt when Count reaches limit */ -#define TIMER_CTRL_NH (1 << 1) /* Count only when CPU NOT halted */ +#define ARC_TIMER_CTRL_IE (1 << 0) /* Interrupt when Count reaches limit */ +#define ARC_TIMER_CTRL_NH (1 << 1) /* Count only when CPU NOT halted */ #define ARC_TIMERN_MAX 0xFFFFFFFF diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 06706a9fd5b1..d7055b41982d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -89,15 +89,6 @@ /* Source PGIDs, one per physical port */ #define PGID_SRC 80 -#define IFH_TAG_TYPE_C 0 -#define IFH_TAG_TYPE_S 1 - -#define IFH_REW_OP_NOOP 0x0 -#define IFH_REW_OP_DSCP 0x1 -#define IFH_REW_OP_ONE_STEP_PTP 0x2 -#define IFH_REW_OP_TWO_STEP_PTP 0x3 -#define IFH_REW_OP_ORIGIN_PTP 0x5 - #define OCELOT_NUM_TC 8 #define OCELOT_SPEED_2500 0 @@ -603,10 +594,10 @@ struct ocelot_port { /* The VLAN ID that will be transmitted as untagged, on egress */ struct ocelot_vlan native_vlan; + unsigned int ptp_skbs_in_flight; u8 ptp_cmd; struct sk_buff_head tx_skbs; u8 ts_id; - spinlock_t 
ts_id_lock; phy_interface_t phy_mode; @@ -680,6 +671,9 @@ struct ocelot { struct ptp_clock *ptp_clock; struct ptp_clock_info ptp_info; struct hwtstamp_config hwtstamp_config; + unsigned int ptp_skbs_in_flight; + /* Protects the 2-step TX timestamp ID logic */ + spinlock_t ts_id_lock; /* Protects the PTP interface state */ struct mutex ptp_lock; /* Protects the PTP clock */ @@ -692,15 +686,6 @@ struct ocelot_policer { u32 burst; /* bytes */ }; -struct ocelot_skb_cb { - struct sk_buff *clone; - u8 ptp_cmd; - u8 ts_id; -}; - -#define OCELOT_SKB_CB(skb) \ - ((struct ocelot_skb_cb *)((skb)->cb)) - #define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) #define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi)) #define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri)) @@ -752,8 +737,6 @@ u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, u32 val, u32 reg, u32 offset); -#if IS_ENABLED(CONFIG_MSCC_OCELOT_SWITCH_LIB) - /* Packet I/O */ bool ocelot_can_inject(struct ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, @@ -761,36 +744,6 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb); void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp); -u32 ocelot_ptp_rew_op(struct sk_buff *skb); -#else - -static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp) -{ - return false; -} - -static inline void ocelot_port_inject_frame(struct ocelot *ocelot, int port, - int grp, u32 rew_op, - struct sk_buff *skb) -{ -} - -static inline int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, - struct sk_buff **skb) -{ - return -EIO; -} - -static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) -{ -} - -static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) -{ - return 0; -} -#endif - /* Hardware initialization */ int ocelot_regfields_init(struct ocelot *ocelot, const struct reg_field *const regfields); diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h index ded497d72bdb..f085884b1fa2 100644 --- a/include/soc/mscc/ocelot_ptp.h +++ b/include/soc/mscc/ocelot_ptp.h @@ -13,6 +13,9 @@ #include <linux/ptp_clock_kernel.h> #include <soc/mscc/ocelot.h> +#define OCELOT_MAX_PTP_ID 63 +#define OCELOT_PTP_FIFO_SIZE 128 + #define PTP_PIN_CFG_RSZ 0x20 #define PTP_PIN_TOD_SEC_MSB_RSZ PTP_PIN_CFG_RSZ #define PTP_PIN_TOD_SEC_LSB_RSZ PTP_PIN_CFG_RSZ diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 25fd525aaf92..4869ebbd438d 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -694,7 +694,7 @@ int ocelot_vcap_filter_add(struct ocelot *ocelot, int ocelot_vcap_filter_del(struct ocelot *ocelot, struct ocelot_vcap_filter *rule); struct ocelot_vcap_filter * -ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id, - bool tc_offload); +ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, + unsigned long cookie, bool tc_offload); #endif /* _OCELOT_VCAP_H_ */ diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h index 01570dbda503..0e45963bb767 100644 --- a/include/sound/hda_codec.h +++ b/include/sound/hda_codec.h @@ -224,6 +224,7 @@ struct hda_codec { #endif /* misc flags */ + unsigned int configured:1; 
/* codec was configured */ unsigned int in_freeing:1; /* being released */ unsigned int registered:1; /* codec was registered */ unsigned int display_power_control:1; /* needs display power */ diff --git a/include/trace/events/block.h b/include/trace/events/block.h index cc5ab96a7471..a95daa4d4caa 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -114,7 +114,7 @@ TRACE_EVENT(block_rq_requeue, */ TRACE_EVENT(block_rq_complete, - TP_PROTO(struct request *rq, int error, unsigned int nr_bytes), + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), @@ -122,7 +122,7 @@ TRACE_EVENT(block_rq_complete, __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) - __field( int, error ) + __field( int , error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -131,7 +131,7 @@ TRACE_EVENT(block_rq_complete, __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; - __entry->error = error; + __entry->error = blk_status_to_errno(error); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 695bfdbfdcad..920b6a303d60 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -178,7 +178,7 @@ TRACE_EVENT(cachefiles_unlink, ), TP_fast_assign( - __entry->obj = obj->fscache.debug_id; + __entry->obj = obj ? obj->fscache.debug_id : UINT_MAX; __entry->de = de; __entry->why = why; ), @@ -205,7 +205,7 @@ TRACE_EVENT(cachefiles_rename, ), TP_fast_assign( - __entry->obj = obj->fscache.debug_id; + __entry->obj = obj ? obj->fscache.debug_id : UINT_MAX; __entry->de = de; __entry->to = to; __entry->why = why; diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index db4f2cec8360..16ae7b666810 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -24,7 +24,7 @@ struct erofs_map_blocks; #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ - { EROFS_MAP_ZIPPED, "Z" }) + { EROFS_MAP_ENCODED, "E" }) TRACE_EVENT(erofs_lookup, diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 0dd30de00e5b..7346f0164cf4 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -6,6 +6,7 @@ #define _TRACE_IO_URING_H #include <linux/tracepoint.h> +#include <uapi/linux/io_uring.h> struct io_wq_work; @@ -497,6 +498,66 @@ TRACE_EVENT(io_uring_task_run, (unsigned long long) __entry->user_data) ); +/* + * io_uring_req_failed - called when an sqe is errored dring submission + * + * @sqe: pointer to the io_uring_sqe that failed + * @error: error it failed with + * + * Allows easier diagnosing of malformed requests in production systems. 
+ */ +TRACE_EVENT(io_uring_req_failed, + + TP_PROTO(const struct io_uring_sqe *sqe, int error), + + TP_ARGS(sqe, error), + + TP_STRUCT__entry ( + __field( u8, opcode ) + __field( u8, flags ) + __field( u8, ioprio ) + __field( u64, off ) + __field( u64, addr ) + __field( u32, len ) + __field( u32, op_flags ) + __field( u64, user_data ) + __field( u16, buf_index ) + __field( u16, personality ) + __field( u32, file_index ) + __field( u64, pad1 ) + __field( u64, pad2 ) + __field( int, error ) + ), + + TP_fast_assign( + __entry->opcode = sqe->opcode; + __entry->flags = sqe->flags; + __entry->ioprio = sqe->ioprio; + __entry->off = sqe->off; + __entry->addr = sqe->addr; + __entry->len = sqe->len; + __entry->op_flags = sqe->rw_flags; + __entry->user_data = sqe->user_data; + __entry->buf_index = sqe->buf_index; + __entry->personality = sqe->personality; + __entry->file_index = sqe->file_index; + __entry->pad1 = sqe->__pad2[0]; + __entry->pad2 = sqe->__pad2[1]; + __entry->error = error; + ), + + TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, " + "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, " + "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", + __entry->opcode, __entry->flags, __entry->ioprio, + (unsigned long long)__entry->off, + (unsigned long long) __entry->addr, __entry->len, + __entry->op_flags, (unsigned long long) __entry->user_data, + __entry->buf_index, __entry->personality, __entry->file_index, + (unsigned long long) __entry->pad1, + (unsigned long long) __entry->pad2, __entry->error) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h index 491098a0d8ed..bf7533f171ff 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -13,11 +13,11 @@ TRACE_EVENT(kyber_latency, - TP_PROTO(struct request_queue *q, const char *domain, const char *type, + TP_PROTO(dev_t dev, const char *domain, const char *type, unsigned int percentile, unsigned int numerator, unsigned int denominator, unsigned int samples), - TP_ARGS(q, domain, type, percentile, numerator, denominator, samples), + TP_ARGS(dev, domain, type, percentile, numerator, denominator, samples), TP_STRUCT__entry( __field( dev_t, dev ) @@ -30,7 +30,7 @@ TRACE_EVENT(kyber_latency, ), TP_fast_assign( - __entry->dev = disk_devt(q->disk); + __entry->dev = dev; strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strlcpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; @@ -47,10 +47,9 @@ TRACE_EVENT(kyber_latency, TRACE_EVENT(kyber_adjust, - TP_PROTO(struct request_queue *q, const char *domain, - unsigned int depth), + TP_PROTO(dev_t dev, const char *domain, unsigned int depth), - TP_ARGS(q, domain, depth), + TP_ARGS(dev, domain, depth), TP_STRUCT__entry( __field( dev_t, dev ) @@ -59,7 +58,7 @@ TRACE_EVENT(kyber_adjust, ), TP_fast_assign( - __entry->dev = disk_devt(q->disk); + __entry->dev = dev; strlcpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -71,9 +70,9 @@ TRACE_EVENT(kyber_adjust, TRACE_EVENT(kyber_throttled, - TP_PROTO(struct request_queue *q, const char *domain), + TP_PROTO(dev_t dev, const char *domain), - TP_ARGS(q, domain), + TP_ARGS(dev, domain), TP_STRUCT__entry( __field( dev_t, dev ) @@ -81,7 +80,7 @@ TRACE_EVENT(kyber_throttled, ), TP_fast_assign( - __entry->dev = disk_devt(q->disk); + __entry->dev = dev; strlcpy(__entry->domain, domain, sizeof(__entry->domain)); ), diff --git a/include/trace/events/pagemap.h 
b/include/trace/events/pagemap.h index 1d28431e85bd..171524d3526d 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -16,38 +16,38 @@ #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u -#define trace_pagemap_flags(page) ( \ - (PageAnon(page) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ - (page_mapped(page) ? PAGEMAP_MAPPED : 0) | \ - (PageSwapCache(page) ? PAGEMAP_SWAPCACHE : 0) | \ - (PageSwapBacked(page) ? PAGEMAP_SWAPBACKED : 0) | \ - (PageMappedToDisk(page) ? PAGEMAP_MAPPEDDISK : 0) | \ - (page_has_private(page) ? PAGEMAP_BUFFERS : 0) \ +#define trace_pagemap_flags(folio) ( \ + (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ + (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ + (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ + (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ + (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ + (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, - TP_PROTO(struct page *page), + TP_PROTO(struct folio *folio), - TP_ARGS(page), + TP_ARGS(folio), TP_STRUCT__entry( - __field(struct page *, page ) + __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( - __entry->page = page; - __entry->pfn = page_to_pfn(page); - __entry->lru = page_lru(page); - __entry->flags = trace_pagemap_flags(page); + __entry->folio = folio; + __entry->pfn = folio_pfn(folio); + __entry->lru = folio_lru_list(folio); + __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ - TP_printk("page=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", - __entry->page, + TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", + __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", @@ -60,23 +60,21 @@ TRACE_EVENT(mm_lru_insertion, TRACE_EVENT(mm_lru_activate, - TP_PROTO(struct page *page), + TP_PROTO(struct folio *folio), - TP_ARGS(page), + TP_ARGS(folio), TP_STRUCT__entry( - __field(struct page *, page ) + __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( - __entry->page = page; - __entry->pfn = page_to_pfn(page); + __entry->folio = folio; + __entry->pfn = folio_pfn(folio); ), - /* Flag format is based on page-types.c formatting for pagemap */ - TP_printk("page=%p pfn=0x%lx", __entry->page, __entry->pfn) - + TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 840d1ba84cf5..7dccb66474f7 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -52,11 +52,11 @@ WB_WORK_REASON struct wb_writeback_work; -DECLARE_EVENT_CLASS(writeback_page_template, +DECLARE_EVENT_CLASS(writeback_folio_template, - TP_PROTO(struct page *page, struct address_space *mapping), + TP_PROTO(struct folio *folio, struct address_space *mapping), - TP_ARGS(page, mapping), + TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) @@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = mapping ? 
mapping->host->i_ino : 0; - __entry->index = page->index; + __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", @@ -79,18 +79,18 @@ DECLARE_EVENT_CLASS(writeback_page_template, ) ); -DEFINE_EVENT(writeback_page_template, writeback_dirty_page, +DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, - TP_PROTO(struct page *page, struct address_space *mapping), + TP_PROTO(struct folio *folio, struct address_space *mapping), - TP_ARGS(page, mapping) + TP_ARGS(folio, mapping) ); -DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, +DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, - TP_PROTO(struct page *page, struct address_space *mapping), + TP_PROTO(struct folio *folio, struct address_space *mapping), - TP_ARGS(page, mapping) + TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, @@ -236,9 +236,9 @@ TRACE_EVENT(inode_switch_wbs, TRACE_EVENT(track_foreign_dirty, - TP_PROTO(struct page *page, struct bdi_writeback *wb), + TP_PROTO(struct folio *folio, struct bdi_writeback *wb), - TP_ARGS(page, wb), + TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) @@ -250,7 +250,7 @@ TRACE_EVENT(track_foreign_dirty, ), TP_fast_assign( - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); @@ -258,7 +258,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 9dc0bf0c5a6e..ecd0f5bdfc1d 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -181,6 +181,10 @@ struct f_owner_ex { blocking */ #define LOCK_UN 8 /* remove lock */ +/* + * LOCK_MAND support has been removed from the kernel. We leave the symbols + * here to not break legacy builds, but these should not be used in new code. + */ #define LOCK_MAND 32 /* This is a mandatory flock ... 
*/ #define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 1c5fb86d455a..4557a8b6086f 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -880,8 +880,11 @@ __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #define __NR_process_mrelease 448 __SYSCALL(__NR_process_mrelease, sys_process_mrelease) +#define __NR_futex_waitv 449 +__SYSCALL(__NR_futex_waitv, sys_futex_waitv) + #undef __NR_syscalls -#define __NR_syscalls 449 +#define __NR_syscalls 450 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index d7d3cfead056..738619994e26 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -771,10 +771,16 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4 +/* + * Read the protocol version in the structure + */ +#define BTRFS_SEND_FLAG_VERSION 0x8 + #define BTRFS_SEND_FLAG_MASK \ (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ - BTRFS_SEND_FLAG_OMIT_END_CMD) + BTRFS_SEND_FLAG_OMIT_END_CMD | \ + BTRFS_SEND_FLAG_VERSION) struct btrfs_ioctl_send_args { __s64 send_fd; /* in */ @@ -782,7 +788,8 @@ struct btrfs_ioctl_send_args { __u64 __user *clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ - __u64 reserved[4]; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ }; /* diff --git a/include/uapi/linux/cdrom.h b/include/uapi/linux/cdrom.h index 6c34f6e2f1f7..804ff8d98f71 100644 --- a/include/uapi/linux/cdrom.h +++ b/include/uapi/linux/cdrom.h @@ -147,6 +147,8 @@ #define CDROM_NEXT_WRITABLE 0x5394 /* get next writable block */ #define CDROM_LAST_WRITTEN 0x5395 /* get last block written on disc */ +#define CDROM_TIMED_MEDIA_CHANGE 0x5396 /* get the timestamp of the last media change */ + /******************************************************* * CDROM IOCTL structures *******************************************************/ @@ -295,6 +297,23 @@ struct cdrom_generic_command }; }; +/* This struct is used by CDROM_TIMED_MEDIA_CHANGE */ +struct cdrom_timed_media_change_info { + __s64 last_media_change; /* Timestamp of the last detected media + * change in ms. May be set by caller, + * updated upon successful return of + * ioctl. + */ + __u64 media_flags; /* Flags returned by ioctl to indicate + * media status. + */ +}; +#define MEDIA_CHANGED_FLAG 0x1 /* Last detected media change was more + * recent than last_media_change set by + * caller. + */ +/* other bits of media_flags available for future use */ + /* * A CD-ROM physical sector size is 2048, 2052, 2056, 2324, 2332, 2336, * 2340, or 2352 bytes long. diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 235e5b2facaa..71a5df8d2689 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -44,6 +44,31 @@ FUTEX_PRIVATE_FLAG) /* + * Flags to specify the bit length of the futex word for futex2 syscalls. + * Currently, only 32 is supported. + */ +#define FUTEX_32 2 + +/* + * Max numbers of elements in a futex_waitv array + */ +#define FUTEX_WAITV_MAX 128 + +/** + * struct futex_waitv - A waiter for vectorized wait + * @val: Expected value at uaddr + * @uaddr: User address to wait on + * @flags: Flags for this waiter + * @__reserved: Reserved member to preserve data alignment. Should be 0. 
+ */ +struct futex_waitv { + __u64 val; + __u64 uaddr; + __u32 flags; + __u32 __reserved; +}; + +/* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. */ diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index 6135d92e0d47..daf82a230c0e 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -26,7 +26,7 @@ #ifndef _UAPI_HYPERV_H #define _UAPI_HYPERV_H -#include <linux/uuid.h> +#include <linux/types.h> /* * Framework version for util services. */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b270a07b285e..c45b5e9a9387 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -158,6 +158,7 @@ enum { #define IORING_TIMEOUT_BOOTTIME (1U << 2) #define IORING_TIMEOUT_REALTIME (1U << 3) #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) +#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 52b54d13f385..6acd4ccafbf7 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -10,6 +10,7 @@ #define __UAPI_MCTP_H #include <linux/types.h> +#include <linux/socket.h> typedef __u8 mctp_eid_t; @@ -18,11 +19,13 @@ struct mctp_addr { }; struct sockaddr_mctp { - unsigned short int smctp_family; - int smctp_network; + __kernel_sa_family_t smctp_family; + __u16 __smctp_pad0; + unsigned int smctp_network; struct mctp_addr smctp_addr; __u8 smctp_type; __u8 smctp_tag; + __u8 __smctp_pad1; }; #define MCTP_NET_ANY 0x0 diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index f92880a15645..bd8860eeb291 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1141,6 +1141,21 @@ enum perf_event_type { */ PERF_RECORD_TEXT_POKE = 20, + /* + * Data written to the AUX area by hardware due to aux_output, may need + * to be matched to the event by an architecture-specific hardware ID. + * This records the hardware ID, but requires sample_id to provide the + * event ID. e.g. Intel PT uses this record to disambiguate PEBS-via-PT + * records from multiple events. + * + * struct { + * struct perf_event_header header; + * u64 hw_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1210,14 +1225,16 @@ union perf_mem_data_src { mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ mem_blk:3, /* access blocked */ - mem_rsvd:21; + mem_hops:3, /* hop level */ + mem_rsvd:18; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:21, + __u64 mem_rsvd:18, + mem_hops:3, /* hop level */ mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ @@ -1241,7 +1258,13 @@ union perf_mem_data_src { #define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ #define PERF_MEM_OP_SHIFT 0 -/* memory hierarchy (memory level, hit or miss) */ +/* + * PERF_MEM_LVL_* namespace being deprecated to some extent in + * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. + * Supporting this namespace in order to not break defined ABIs.
+ * + * memory hierarchy (memory level, hit or miss) + */ #define PERF_MEM_LVL_NA 0x01 /* not available */ #define PERF_MEM_LVL_HIT 0x02 /* hit level */ #define PERF_MEM_LVL_MISS 0x04 /* miss level */ @@ -1307,6 +1330,11 @@ union perf_mem_data_src { #define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ #define PERF_MEM_BLK_SHIFT 40 +/* hop level */ +#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ +/* 2-7 available */ +#define PERF_MEM_HOPS_SHIFT 43 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index b96c1ea7166d..eda0426ec4c2 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -213,13 +213,13 @@ enum { XFRM_MSG_GETSPDINFO, #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO + XFRM_MSG_MAPPING, +#define XFRM_MSG_MAPPING XFRM_MSG_MAPPING + XFRM_MSG_SETDEFAULT, #define XFRM_MSG_SETDEFAULT XFRM_MSG_SETDEFAULT XFRM_MSG_GETDEFAULT, #define XFRM_MSG_GETDEFAULT XFRM_MSG_GETDEFAULT - - XFRM_MSG_MAPPING, -#define XFRM_MSG_MAPPING XFRM_MSG_MAPPING __XFRM_MSG_MAX }; #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1) @@ -514,9 +514,12 @@ struct xfrm_user_offload { #define XFRM_OFFLOAD_INBOUND 2 struct xfrm_userpolicy_default { -#define XFRM_USERPOLICY_DIRMASK_MAX (sizeof(__u8) * 8) - __u8 dirmask; - __u8 action; +#define XFRM_USERPOLICY_UNSPEC 0 +#define XFRM_USERPOLICY_BLOCK 1 +#define XFRM_USERPOLICY_ACCEPT 2 + __u8 in; + __u8 fwd; + __u8 out; }; #ifndef __KERNEL__ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 7cc2a0f3f2f5..d13bb8c1b450 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -917,7 +917,6 @@ struct hl_wait_cs_in { #define HL_WAIT_CS_STATUS_BUSY 1 #define HL_WAIT_CS_STATUS_TIMEDOUT 2 #define HL_WAIT_CS_STATUS_ABORTED 3 -#define HL_WAIT_CS_STATUS_INTERRUPTED 4 #define HL_WAIT_CS_STATUS_FLAG_GONE 0x1 #define HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD 0x2 @@ -1286,7 +1285,8 @@ struct hl_debug_args { * EIO - The CS was aborted (usually because the device was reset) * ENODEV - The device wants to do hard-reset (so user need to close FD) * - * The driver also returns a custom define inside the IOCTL which can be: + * The driver also returns a custom define in case the IOCTL call returned 0. 
+ * The define can be one of the following: * * HL_WAIT_CS_STATUS_COMPLETED - The CS has been completed successfully (0) * HL_WAIT_CS_STATUS_BUSY - The CS is still executing (0) @@ -1294,8 +1294,6 @@ struct hl_debug_args { * (ETIMEDOUT) * HL_WAIT_CS_STATUS_ABORTED - The CS was aborted, usually because the * device was reset (EIO) - * HL_WAIT_CS_STATUS_INTERRUPTED - Waiting for the CS was interrupted (EINTR) - * */ #define HL_IOCTL_WAIT_CS \ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index db28e79b77ee..a3584a357f35 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -52,12 +52,12 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); #if defined(CONFIG_XEN_PV) int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, - unsigned int domid, bool no_translate, struct page **pages); + unsigned int domid, bool no_translate); #else static inline int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, unsigned int domid, - bool no_translate, struct page **pages) + bool no_translate) { BUG(); return 0; @@ -134,7 +134,7 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma, */ BUG_ON(err_ptr == NULL); return xen_remap_pfn(vma, addr, gfn, nr, err_ptr, prot, domid, - false, pages); + false); } /* @@ -146,7 +146,6 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma, * @err_ptr: Returns per-MFN error status. * @prot: page protection mask * @domid: Domain owning the pages - * @pages: Array of pages if this domain has an auto-translated physmap * * @mfn and @err_ptr may point to the same buffer, the MFNs will be * overwritten by the error codes after they are mapped. 
@@ -157,14 +156,13 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma, static inline int xen_remap_domain_mfn_array(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *mfn, int nr, int *err_ptr, - pgprot_t prot, unsigned int domid, - struct page **pages) + pgprot_t prot, unsigned int domid) { if (xen_feature(XENFEAT_auto_translated_physmap)) return -EOPNOTSUPP; return xen_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid, - true, pages); + true); } /* xen_remap_domain_gfn_range() - map a range of foreign frames @@ -188,8 +186,7 @@ static inline int xen_remap_domain_gfn_range(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) return -EOPNOTSUPP; - return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false, - pages); + return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false); } int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, diff --git a/init/main.c b/init/main.c index 81a79a77db46..4162d7f3179f 100644 --- a/init/main.c +++ b/init/main.c @@ -83,7 +83,6 @@ #include <linux/ptrace.h> #include <linux/pti.h> #include <linux/blkdev.h> -#include <linux/elevator.h> #include <linux/sched/clock.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> @@ -382,6 +381,7 @@ static char * __init xbc_make_cmdline(const char *key) ret = xbc_snprint_cmdline(new_cmdline, len + 1, root); if (ret < 0 || ret > len) { pr_err("Failed to print extra kernel cmdline.\n"); + memblock_free_ptr(new_cmdline, len + 1); return NULL; } diff --git a/kernel/Makefile b/kernel/Makefile index 4df609be42d0..3f6ab5d5041b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ -obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_FUTEX) += futex/ obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/acct.c b/kernel/acct.c index 23a7ab8e6cbc..3df53cf1dcd5 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -60,7 +60,6 @@ #include <linux/sched/cputime.h> #include <asm/div64.h> -#include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> #include <linux/fs_pin.h> diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dd73a64f921..b1cb1dbf7417 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -657,7 +657,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; case AUDIT_SADDR_FAM: - if (ctx->sockaddr) + if (ctx && ctx->sockaddr) result = audit_comparator(ctx->sockaddr->ss_family, f->op, f->val); break; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cebd4fb06d19..447def540544 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); + spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6b7dfdd8066..6e3ae90ad107 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -524,6 +524,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; +long bpf_jit_limit_max __read_mostly; static 
void bpf_prog_ksym_set_addr(struct bpf_prog *prog) @@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void) static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ - bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); + bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2, PAGE_SIZE), LONG_MAX); return 0; } @@ -1821,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + bool ret; + if (fp->kprobe_override) return false; - if (!array->aux->type) { + spin_lock(&array->aux->owner.lock); + + if (!array->aux->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->type = fp->type; - array->aux->jited = fp->jited; - return true; + array->aux->owner.type = fp->type; + array->aux->owner.jited = fp->jited; + ret = true; + } else { + ret = array->aux->owner.type == fp->type && + array->aux->owner.jited == fp->jited; } - - return array->aux->type == fp->type && - array->aux->jited == fp->jited; + spin_unlock(&array->aux->owner.lock); + return ret; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 09a3fd97d329..6e75bbee39f0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map) static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { - u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + u64 elem_size = sizeof(struct stack_map_bucket) + + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e50c0bfdb7d..1cad6979a0d0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - type = array->aux->type; - jited = array->aux->jited; + spin_lock(&array->aux->owner.lock); + type = array->aux->owner.type; + jited = array->aux->owner.jited; + spin_unlock(&array->aux->owner.lock); } seq_printf(m, @@ -1337,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; - int ufd = attr->map_fd; + int ufd = attr->batch.map_fd; void *key, *value; struct fd f; int err = 0; - f = fdget(ufd); if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -1367,6 +1368,7 @@ int generic_map_update_batch(struct bpf_map *map, return -ENOMEM; } + f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, @@ -1386,6 +1388,7 @@ int generic_map_update_batch(struct bpf_map *map, kvfree(value); kvfree(key); + fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e76b55917905..de006552be8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13319,7 +13319,7 @@ BTF_SET_START(btf_non_sleepable_error_inject) /* Three functions below can be called from sleepable and non-sleepable context. * Assume non-sleepable from bpf safety point of view. 
*/ -BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, __filemap_add_folio) BTF_ID(func, should_fail_alloc_page) BTF_ID(func, should_failslab) BTF_SET_END(btf_non_sleepable_error_inject) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 570b0c97392a..ea08f01d0111 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2187,8 +2187,10 @@ static void cgroup_kill_sb(struct super_block *sb) * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + cgroup_bpf_offline(&root->cgrp); percpu_ref_kill(&root->cgrp.self.refcnt); + } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index df1ccf4558f8..2a9695ccb65f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -311,17 +311,19 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global locks guarding cpuset structures - cpuset_mutex and + * There are two global locks guarding cpuset structures - cpuset_rwsem and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. + * comment. The cpuset code uses only cpuset_rwsem write lock. Other + * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to + * prevent change to cpuset structures. * * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_mutex. While it is performing these checks, various + * just holding cpuset_rwsem. While it is performing these checks, various * callback routines can briefly acquire callback_lock to query cpusets. * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. @@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void) * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) @@ -435,7 +437,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. 
Call holding cpuset_mutex. + * are only set if the other's are set. Call holding cpuset_rwsem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_mutex held. + * cpuset_rwsem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_mutex held. */ +/* Must be called with cpuset_rwsem held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -726,7 +728,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_mutex held. + * Must be called with cpuset_rwsem held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes cpus_read_lock(). + * Call with cpuset_rwsem held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void) * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_cpumask(struct cpuset *cs) @@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) { @@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound; * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_mutex */ + static nodemask_t newmems; /* protected by cpuset_rwsem */ struct css_task_iter it; struct task_struct *task; @@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_mutex, we know that no other rebind effort + * the global cpuset_rwsem, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. 
Also migrate pages in each mm to new nodes. @@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_lock during call. + * Call with cpuset_rwsem held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays + * function is called with cpuset_rwsem held, cpuset membership stays * stable. */ static void update_tasks_flags(struct cpuset *cs) @@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1980,7 +1982,7 @@ out: * cs: the cpuset to update * new_prs: new partition root state * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_prstate(struct cpuset *cs, int new_prs) { @@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; -/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; @@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) } /* - * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ @@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_mutex */ + /* static buf protected by cpuset_rwsem */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; @@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy + * grabbing cpuset_rwsem anyway. This only happens on the legacy * hierarchies. */ css_get(&cs->css); @@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. 
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_rwsem, keeping cpuset_attach() from changing it * anyway. */ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, diff --git a/kernel/cred.c b/kernel/cred.c index f784e08c2fbd..1ae0b4948a5a 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -225,8 +225,6 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif - new->ucounts = get_ucounts(&init_ucounts); - if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -501,7 +499,7 @@ int commit_creds(struct cred *new) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); - if (new->user != old->user) + if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); @@ -669,7 +667,7 @@ int set_cred_ucounts(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; - struct ucounts *old_ucounts = new->ucounts; + struct ucounts *new_ucounts, *old_ucounts = new->ucounts; if (new->user == old->user && new->user_ns == old->user_ns) return 0; @@ -681,9 +679,10 @@ int set_cred_ucounts(struct cred *new) if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) return 0; - if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) return -EAGAIN; + new->ucounts = new_ucounts; if (old_ucounts) put_ucounts(old_ucounts); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 95445bd6eb72..7a14ca29c377 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -552,7 +552,7 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. 
*/ -static void add_dma_entry(struct dma_debug_entry *entry) +static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) { struct hash_bucket *bucket; unsigned long flags; @@ -566,7 +566,7 @@ static void add_dma_entry(struct dma_debug_entry *entry) if (rc == -ENOMEM) { pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST) { + } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { err_printk(entry->dev, entry, "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } @@ -1191,7 +1191,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, EXPORT_SYMBOL(debug_dma_map_single); void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr) + size_t size, int direction, dma_addr_t dma_addr, + unsigned long attrs) { struct dma_debug_entry *entry; @@ -1222,7 +1223,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, check_for_illegal_area(dev, addr, size); } - add_dma_entry(entry); + add_dma_entry(entry, attrs); } void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) @@ -1280,7 +1281,8 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, } void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction) + int nents, int mapped_ents, int direction, + unsigned long attrs) { struct dma_debug_entry *entry; struct scatterlist *s; @@ -1289,6 +1291,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, if (unlikely(dma_debug_disabled())) return; + for_each_sg(sg, s, nents, i) { + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) + check_for_illegal_area(dev, sg_virt(s), s->length); + } + for_each_sg(sg, s, mapped_ents, i) { entry = dma_entry_alloc(); if (!entry) @@ -1304,15 +1312,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; - check_for_stack(dev, sg_page(s), s->offset); - - if (!PageHighMem(sg_page(s))) { - check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); - } - check_sg_segment(dev, s); - add_dma_entry(entry); + add_dma_entry(entry, attrs); } } @@ -1368,7 +1370,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, } void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt) + dma_addr_t dma_addr, void *virt, + unsigned long attrs) { struct dma_debug_entry *entry; @@ -1398,7 +1401,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, else entry->pfn = page_to_pfn(virt_to_page(virt)); - add_dma_entry(entry); + add_dma_entry(entry, attrs); } void debug_dma_free_coherent(struct device *dev, size_t size, @@ -1429,7 +1432,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr) + int direction, dma_addr_t dma_addr, + unsigned long attrs) { struct dma_debug_entry *entry; @@ -1449,7 +1453,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - add_dma_entry(entry); + add_dma_entry(entry, attrs); } void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 83643b3010b2..f525197d3cae 100644 --- a/kernel/dma/debug.h +++ 
b/kernel/dma/debug.h @@ -11,26 +11,30 @@ #ifdef CONFIG_DMA_API_DEBUG extern void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction, dma_addr_t dma_addr); + int direction, dma_addr_t dma_addr, + unsigned long attrs); extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction); + int nents, int mapped_ents, int direction, + unsigned long attrs); extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir); extern void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt); + dma_addr_t dma_addr, void *virt, + unsigned long attrs); extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, int direction, - dma_addr_t dma_addr); + dma_addr_t dma_addr, + unsigned long attrs); extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, size_t size, int direction); @@ -53,7 +57,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction, dma_addr_t dma_addr) + int direction, dma_addr_t dma_addr, + unsigned long attrs) { } @@ -63,7 +68,8 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, } static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction) + int nents, int mapped_ents, int direction, + unsigned long attrs) { } @@ -74,7 +80,8 @@ static inline void debug_dma_unmap_sg(struct device *dev, } static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt) + dma_addr_t dma_addr, void *virt, + unsigned long attrs) { } @@ -85,7 +92,8 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size, static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, int direction, - dma_addr_t dma_addr) + dma_addr_t dma_addr, + unsigned long attrs) { } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 06fec5547e7c..8349a9f2c345 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -156,7 +156,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr); + debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; } @@ -195,7 +195,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, ents = ops->map_sg(dev, sg, nents, dir, attrs); if (ents > 0) - debug_dma_map_sg(dev, sg, nents, ents, dir); + debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && ents != -EIO)) return -EIO; @@ -249,12 +249,12 @@ EXPORT_SYMBOL(dma_map_sg_attrs); * Returns 0 on success or a negative error code on error. The following * error codes are supported with the given meaning: * - * -EINVAL - An invalid argument, unaligned access or other error - * in usage. Will not succeed if retried. 
- * -ENOMEM - Insufficient resources (like memory or IOVA space) to - * complete the mapping. Should succeed if retried later. - * -EIO - Legacy error code with an unknown meaning. eg. this is - * returned if a lower level call returned DMA_MAPPING_ERROR. + * -EINVAL An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned DMA_MAPPING_ERROR. */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) @@ -305,7 +305,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, else if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr); + debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); return addr; } EXPORT_SYMBOL(dma_map_resource); @@ -510,7 +510,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, else return NULL; - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } EXPORT_SYMBOL(dma_alloc_attrs); @@ -566,7 +566,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); if (page) - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); @@ -644,7 +644,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (sgt) { sgt->nents = 1; - debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir); + debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); } return sgt; } diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 3c022e33c109..8591c180b52b 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,10 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) -endif - obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_UPROBES) += uprobes.o - diff --git a/kernel/events/core.c b/kernel/events/core.c index f23ca260307f..a89b01b7e59a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9099,6 +9099,36 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } +void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 hw_id; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.hw_id = hw_id; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index af24dc3febbe..6357c3580d07 100644 --- 
a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); + err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, + GFP_KERNEL); if (err) return err; } diff --git a/kernel/exit.c b/kernel/exit.c index 63851320ae73..50f1692c732d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,6 @@ #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> -#include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> #include <linux/fs_struct.h> diff --git a/kernel/fork.c b/kernel/fork.c index 0e4251dc5436..8e9feeef555e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,7 +76,6 @@ #include <linux/taskstats_kern.h> #include <linux/random.h> #include <linux/tty.h> -#include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> #include <linux/perf_event.h> diff --git a/kernel/futex.c b/kernel/futex.c deleted file mode 100644 index c15ad276fd15..000000000000 --- a/kernel/futex.c +++ /dev/null @@ -1,4272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Fast Userspace Mutexes (which I call "Futexes!"). - * (C) Rusty Russell, IBM 2002 - * - * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar - * (C) Copyright 2003 Red Hat Inc, All Rights Reserved - * - * Removed page pinning, fix privately mapped COW pages and other cleanups - * (C) Copyright 2003, 2004 Jamie Lokier - * - * Robust futex support started by Ingo Molnar - * (C) Copyright 2006 Red Hat Inc, All Rights Reserved - * Thanks to Thomas Gleixner for suggestions, analysis and fixes. - * - * PI-futex support started by Ingo Molnar and Thomas Gleixner - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * PRIVATE futexes by Eric Dumazet - * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> - * - * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> - * Copyright (C) IBM Corporation, 2009 - * Thanks to Thomas Gleixner for conceptual design and careful reviews. - * - * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly - * enough at me, Linus for the original (flawed) idea, Matthew - * Kirkwood for proof-of-concept implementation. - * - * "The futexes are also cursed." - * "But they come in a choice of three flavours!" - */ -#include <linux/compat.h> -#include <linux/jhash.h> -#include <linux/pagemap.h> -#include <linux/syscalls.h> -#include <linux/freezer.h> -#include <linux/memblock.h> -#include <linux/fault-inject.h> -#include <linux/time_namespace.h> - -#include <asm/futex.h> - -#include "locking/rtmutex_common.h" - -/* - * READ this before attempting to hack on futexes! - * - * Basic futex operation and ordering guarantees - * ============================================= - * - * The waiter reads the futex value in user space and calls - * futex_wait(). This function computes the hash bucket and acquires - * the hash bucket lock. After that it reads the futex user space value - * again and verifies that the data has not changed. If it has not changed - * it enqueues itself into the hash bucket, releases the hash bucket lock - * and schedules. - * - * The waker side modifies the user space value of the futex and calls - * futex_wake(). This function computes the hash bucket and acquires the - * hash bucket lock. 
Then it looks for waiters on that futex in the hash - * bucket and wakes them. - * - * In futex wake up scenarios where no tasks are blocked on a futex, taking - * the hb spinlock can be avoided and simply return. In order for this - * optimization to work, ordering guarantees must exist so that the waiter - * being added to the list is acknowledged when the list is concurrently being - * checked by the waker, avoiding scenarios like the following: - * - * CPU 0 CPU 1 - * val = *futex; - * sys_futex(WAIT, futex, val); - * futex_wait(futex, val); - * uval = *futex; - * *futex = newval; - * sys_futex(WAKE, futex); - * futex_wake(futex); - * if (queue_empty()) - * return; - * if (uval == val) - * lock(hash_bucket(futex)); - * queue(); - * unlock(hash_bucket(futex)); - * schedule(); - * - * This would cause the waiter on CPU 0 to wait forever because it - * missed the transition of the user space value from val to newval - * and the waker did not find the waiter in the hash bucket queue. - * - * The correct serialization ensures that a waiter either observes - * the changed user space value before blocking or is woken by a - * concurrent waker: - * - * CPU 0 CPU 1 - * val = *futex; - * sys_futex(WAIT, futex, val); - * futex_wait(futex, val); - * - * waiters++; (a) - * smp_mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `--------> smp_mb(); (B) - * if (uval == val) - * queue(); - * unlock(hash_bucket(futex)); - * schedule(); if (waiters) - * lock(hash_bucket(futex)); - * else wake_waiters(futex); - * waiters--; (b) unlock(hash_bucket(futex)); - * - * Where (A) orders the waiters increment and the futex value read through - * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read (see hb_waiters_pending()). - * - * This yields the following case (where X:=waiters, Y:=futex): - * - * X = Y = 0 - * - * w[X]=1 w[Y]=1 - * MB MB - * r[Y]=y r[X]=x - * - * Which guarantees that x==0 && y==0 is impossible; which translates back into - * the guarantee that we cannot both miss the futex variable change and the - * enqueue. - * - * Note that a new waiter is accounted for in (a) even when it is possible that - * the wait call can return error, in which case we backtrack from it in (b). - * Refer to the comment in queue_lock(). - * - * Similarly, in order to account for waiters being requeued on another - * address we always increment the waiters for the destination bucket before - * acquiring the lock. It then decrements them again after releasing it - - * the code that actually moves the futex(es) between hash buckets (requeue_futex) - * will do the additional required waiter count housekeeping. This is done for - * double_lock_hb() and double_unlock_hb(), respectively. - */ - -#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -#define futex_cmpxchg_enabled 1 -#else -static int __read_mostly futex_cmpxchg_enabled; -#endif - -/* - * Futex flags used to encode options to functions and preserve them across - * restarts. - */ -#ifdef CONFIG_MMU -# define FLAGS_SHARED 0x01 -#else -/* - * NOMMU does not have per process address space. Let the compiler optimize - * code away. 
- */ -# define FLAGS_SHARED 0x00 -#endif -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - -/* - * Priority Inheritance state: - */ -struct futex_pi_state { - /* - * list of 'owned' pi_state instances - these have to be - * cleaned up in do_exit() if the task exits prematurely: - */ - struct list_head list; - - /* - * The PI object: - */ - struct rt_mutex_base pi_mutex; - - struct task_struct *owner; - refcount_t refcount; - - union futex_key key; -} __randomize_layout; - -/** - * struct futex_q - The hashed futex queue entry, one per waiting task - * @list: priority-sorted list of tasks waiting on this futex - * @task: the task waiting on the futex - * @lock_ptr: the hash bucket lock - * @key: the key the futex is hashed on - * @pi_state: optional priority inheritance state - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup - * @requeue_state: State field for futex_requeue_pi() - * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) - * - * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so - * we can wake only the relevant ones (hashed queues may be shared). - * - * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakeup is always to make the first condition true, then - * the second. - * - * PI futexes are typically woken before they are removed from the hash list via - * the rt_mutex code. See unqueue_me_pi(). - */ -struct futex_q { - struct plist_node list; - - struct task_struct *task; - spinlock_t *lock_ptr; - union futex_key key; - struct futex_pi_state *pi_state; - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; - atomic_t requeue_state; -#ifdef CONFIG_PREEMPT_RT - struct rcuwait requeue_wait; -#endif -} __randomize_layout; - -/* - * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an - * underlying rtmutex. The task which is about to be requeued could have - * just woken up (timeout, signal). After the wake up the task has to - * acquire hash bucket lock, which is held by the requeue code. As a task - * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking - * and the hash bucket lock blocking would collide and corrupt state. - * - * On !PREEMPT_RT this is not a problem and everything could be serialized - * on hash bucket lock, but aside of having the benefit of common code, - * this allows to avoid doing the requeue when the task is already on the - * way out and taking the hash bucket lock of the original uaddr1 when the - * requeue has been completed. - * - * The following state transitions are valid: - * - * On the waiter side: - * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE - * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT - * - * On the requeue side: - * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS - * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED - * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) - * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED - * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) - * - * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this - * signals that the waiter is already on the way out. It also means that - * the waiter is still on the 'wait' futex, i.e. uaddr1. 
- * - * The waiter side signals early wakeup to the requeue side either through - * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending - * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately - * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, - * which means the wakeup is interleaving with a requeue in progress it has - * to wait for the requeue side to change the state. Either to DONE/LOCKED - * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex - * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by - * the requeue side when the requeue attempt failed via deadlock detection - * and therefore the waiter q is still on the uaddr1 futex. - */ -enum { - Q_REQUEUE_PI_NONE = 0, - Q_REQUEUE_PI_IGNORE, - Q_REQUEUE_PI_IN_PROGRESS, - Q_REQUEUE_PI_WAIT, - Q_REQUEUE_PI_DONE, - Q_REQUEUE_PI_LOCKED, -}; - -static const struct futex_q futex_q_init = { - /* list gets initialized in queue_me()*/ - .key = FUTEX_KEY_INIT, - .bitset = FUTEX_BITSET_MATCH_ANY, - .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), -}; - -/* - * Hash buckets are shared by all the futex_keys that hash to the same - * location. Each key may have multiple futex_q structures, one for each task - * waiting on a futex. - */ -struct futex_hash_bucket { - atomic_t waiters; - spinlock_t lock; - struct plist_head chain; -} ____cacheline_aligned_in_smp; - -/* - * The base of the bucket array and its size are always used together - * (after initialization only in hash_futex()), so ensure that they - * reside in the same cacheline. - */ -static struct { - struct futex_hash_bucket *queues; - unsigned long hashsize; -} __futex_data __read_mostly __aligned(2*sizeof(long)); -#define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) - - -/* - * Fault injections for futexes. - */ -#ifdef CONFIG_FAIL_FUTEX - -static struct { - struct fault_attr attr; - - bool ignore_private; -} fail_futex = { - .attr = FAULT_ATTR_INITIALIZER, - .ignore_private = false, -}; - -static int __init setup_fail_futex(char *str) -{ - return setup_fault_attr(&fail_futex.attr, str); -} -__setup("fail_futex=", setup_fail_futex); - -static bool should_fail_futex(bool fshared) -{ - if (fail_futex.ignore_private && !fshared) - return false; - - return should_fail(&fail_futex.attr, 1); -} - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - -static int __init fail_futex_debugfs(void) -{ - umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - struct dentry *dir; - - dir = fault_create_debugfs_attr("fail_futex", NULL, - &fail_futex.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - debugfs_create_bool("ignore-private", mode, dir, - &fail_futex.ignore_private); - return 0; -} - -late_initcall(fail_futex_debugfs); - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - -#else -static inline bool should_fail_futex(bool fshared) -{ - return false; -} -#endif /* CONFIG_FAIL_FUTEX */ - -#ifdef CONFIG_COMPAT -static void compat_exit_robust_list(struct task_struct *curr); -#endif - -/* - * Reflects a new waiter being added to the waitqueue. - */ -static inline void hb_waiters_inc(struct futex_hash_bucket *hb) -{ -#ifdef CONFIG_SMP - atomic_inc(&hb->waiters); - /* - * Full barrier (A), see the ordering comment above. - */ - smp_mb__after_atomic(); -#endif -} - -/* - * Reflects a waiter being removed from the waitqueue by wakeup - * paths. 
- */ -static inline void hb_waiters_dec(struct futex_hash_bucket *hb) -{ -#ifdef CONFIG_SMP - atomic_dec(&hb->waiters); -#endif -} - -static inline int hb_waiters_pending(struct futex_hash_bucket *hb) -{ -#ifdef CONFIG_SMP - /* - * Full barrier (B), see the ordering comment above. - */ - smp_mb(); - return atomic_read(&hb->waiters); -#else - return 1; -#endif -} - -/** - * hash_futex - Return the hash bucket in the global hash - * @key: Pointer to the futex key for which the hash is calculated - * - * We hash on the keys returned from get_futex_key (see below) and return the - * corresponding hash bucket in the global hash. - */ -static struct futex_hash_bucket *hash_futex(union futex_key *key) -{ - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, - key->both.offset); - - return &futex_queues[hash & (futex_hashsize - 1)]; -} - - -/** - * match_futex - Check whether two futex keys are equal - * @key1: Pointer to key1 - * @key2: Pointer to key2 - * - * Return 1 if two futex_keys are equal, 0 otherwise. - */ -static inline int match_futex(union futex_key *key1, union futex_key *key2) -{ - return (key1 && key2 - && key1->both.word == key2->both.word - && key1->both.ptr == key2->both.ptr - && key1->both.offset == key2->both.offset); -} - -enum futex_access { - FUTEX_READ, - FUTEX_WRITE -}; - -/** - * futex_setup_timer - set up the sleeping hrtimer. - * @time: ptr to the given timeout value - * @timeout: the hrtimer_sleeper structure to be set up - * @flags: futex flags - * @range_ns: optional range in ns - * - * Return: Initialized hrtimer_sleeper structure or NULL if no timeout - * value given - */ -static inline struct hrtimer_sleeper * -futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, - int flags, u64 range_ns) -{ - if (!time) - return NULL; - - hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - /* - * If range_ns is 0, calling hrtimer_set_expires_range_ns() is - * effectively the same as calling hrtimer_set_expires(). - */ - hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); - - return timeout; -} - -/* - * Generate a machine wide unique identifier for this inode. - * - * This relies on u64 not wrapping in the life-time of the machine; which with - * 1ns resolution means almost 585 years. - * - * This further relies on the fact that a well formed program will not unmap - * the file while it has a (shared) futex waiting on it. This mapping will have - * a file reference which pins the mount and inode. - * - * If for some reason an inode gets evicted and read back in again, it will get - * a new sequence number and will _NOT_ match, even though it is the exact same - * file. - * - * It is important that match_futex() will never have a false-positive, esp. - * for PI futexes that can mess up the state. The above argues that false-negatives - * are only possible for malformed programs. - */ -static u64 get_inode_sequence_number(struct inode *inode) -{ - static atomic64_t i_seq; - u64 old; - - /* Does the inode already have a sequence number? 
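The (A)/(B) barrier pairing that hb_waiters_inc() and hb_waiters_pending() rely on maps onto the classic store-buffering pattern. A hedged user-space analogue with C11 atomics (all names below are illustrative; seq_cst fences stand in for smp_mb()):

#include <stdatomic.h>

static atomic_int waiters;   /* analogue of hb->waiters    */
static atomic_int futex_val; /* analogue of the futex word */

/* Waiter side: "waiters++; smp_mb(); uval = *futex"  -- barrier (A). */
static int waiter_should_block(int expected)
{
	atomic_fetch_add(&waiters, 1);
	atomic_thread_fence(memory_order_seq_cst);                /* (A) */
	return atomic_load_explicit(&futex_val, memory_order_relaxed) == expected;
}

/* Waker side: "*futex = new; smp_mb(); if (waiters) ..."  -- barrier (B). */
static int waker_must_take_lock(int newval)
{
	atomic_store_explicit(&futex_val, newval, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);                /* (B) */
	return atomic_load_explicit(&waiters, memory_order_relaxed) != 0;
}

/* With both fences present, "waiter misses newval" and "waker sees
 * waiters == 0" cannot both happen, which is exactly the w[X]/r[Y]
 * argument in the ordering comment above. */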
*/ - old = atomic64_read(&inode->i_sequence); - if (likely(old)) - return old; - - for (;;) { - u64 new = atomic64_add_return(1, &i_seq); - if (WARN_ON_ONCE(!new)) - continue; - - old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); - if (old) - return old; - return new; - } -} - -/** - * get_futex_key() - Get parameters which are the keys for a futex - * @uaddr: virtual address of the futex - * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED - * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: FUTEX_READ, - * FUTEX_WRITE) - * - * Return: a negative error code or 0 - * - * The key words are stored in @key on success. - * - * For shared mappings (when @fshared), the key is: - * - * ( inode->i_sequence, page->index, offset_within_page ) - * - * [ also see get_inode_sequence_number() ] - * - * For private mappings (or when !@fshared), the key is: - * - * ( current->mm, address, 0 ) - * - * This allows (cross process, where applicable) identification of the futex - * without keeping the page pinned for the duration of the FUTEX_WAIT. - * - * lock_page() might sleep, the caller should not hold a spinlock. - */ -static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, - enum futex_access rw) -{ - unsigned long address = (unsigned long)uaddr; - struct mm_struct *mm = current->mm; - struct page *page, *tail; - struct address_space *mapping; - int err, ro = 0; - - /* - * The futex address must be "naturally" aligned. - */ - key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) - return -EINVAL; - address -= key->both.offset; - - if (unlikely(!access_ok(uaddr, sizeof(u32)))) - return -EFAULT; - - if (unlikely(should_fail_futex(fshared))) - return -EFAULT; - - /* - * PROCESS_PRIVATE futexes are fast. - * As the mm cannot disappear under us and the 'key' only needs - * virtual address, we dont even have to find the underlying vma. - * Note : We do have to check 'uaddr' is a valid user address, - * but access_ok() should be faster than find_vma() - */ - if (!fshared) { - key->private.mm = mm; - key->private.address = address; - return 0; - } - -again: - /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); - /* - * If write access is not required (eg. FUTEX_WAIT), try - * and get read-only access. - */ - if (err == -EFAULT && rw == FUTEX_READ) { - err = get_user_pages_fast(address, 1, 0, &page); - ro = 1; - } - if (err < 0) - return err; - else - err = 0; - - /* - * The treatment of mapping from this point on is critical. The page - * lock protects many things but in this context the page lock - * stabilizes mapping, prevents inode freeing in the shared - * file-backed region case and guards against movement to swap cache. - * - * Strictly speaking the page lock is not needed in all cases being - * considered here and page lock forces unnecessarily serialization - * From this point on, mapping will be re-verified if necessary and - * page lock will be acquired only if it is unavoidable - * - * Mapping checks require the head page for any compound page so the - * head page and mapping is looked up now. For anonymous pages, it - * does not matter if the page splits in the future as the key is - * based on the address. For filesystem-backed pages, the tail is - * required as the index of the page determines the key. 
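get_inode_sequence_number() is a lock-free lazy initialization: take a fresh value from a global counter, then publish it with a cmpxchg from zero so every racing caller agrees on a single winner. A rough sketch of the same idiom in C11 atomics, with illustrative names (the wraparound WARN of the original is ignored here):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t seq_source;        /* analogue of i_seq             */

struct obj { _Atomic uint64_t sequence; }; /* analogue of inode->i_sequence */

static uint64_t get_sequence_number(struct obj *o)
{
	uint64_t old = atomic_load(&o->sequence);
	uint64_t expected = 0;
	uint64_t new;

	if (old)                        /* fast path: already assigned */
		return old;

	new = atomic_fetch_add(&seq_source, 1) + 1;  /* counter starts at 1 */

	/* Publish 'new' only if nobody beat us to it; otherwise use theirs. */
	if (atomic_compare_exchange_strong(&o->sequence, &expected, new))
		return new;
	return expected;
}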
For - * base pages, there is no tail page and tail == page. - */ - tail = page; - page = compound_head(page); - mapping = READ_ONCE(page->mapping); - - /* - * If page->mapping is NULL, then it cannot be a PageAnon - * page; but it might be the ZERO_PAGE or in the gate area or - * in a special mapping (all cases which we are happy to fail); - * or it may have been a good file page when get_user_pages_fast - * found it, but truncated or holepunched or subjected to - * invalidate_complete_page2 before we got the page lock (also - * cases which we are happy to fail). And we hold a reference, - * so refcount care in invalidate_complete_page's remove_mapping - * prevents drop_caches from setting mapping to NULL beneath us. - * - * The case we do have to guard against is when memory pressure made - * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page->mapping. - */ - if (unlikely(!mapping)) { - int shmem_swizzled; - - /* - * Page lock is required to identify which special case above - * applies. If this is really a shmem page then the page lock - * will prevent unexpected transitions. - */ - lock_page(page); - shmem_swizzled = PageSwapCache(page) || page->mapping; - unlock_page(page); - put_page(page); - - if (shmem_swizzled) - goto again; - - return -EFAULT; - } - - /* - * Private mappings are handled in a simple way. - * - * If the futex key is stored on an anonymous page, then the associated - * object is the mm which is implicitly pinned by the calling process. - * - * NOTE: When userspace waits on a MAP_SHARED mapping, even if - * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. - */ - if (PageAnon(page)) { - /* - * A RO anonymous page will never change and thus doesn't make - * sense for futex operations. - */ - if (unlikely(should_fail_futex(true)) || ro) { - err = -EFAULT; - goto out; - } - - key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ - key->private.mm = mm; - key->private.address = address; - - } else { - struct inode *inode; - - /* - * The associated futex object in this case is the inode and - * the page->mapping must be traversed. Ordinarily this should - * be stabilised under page lock but it's not strictly - * necessary in this case as we just want to pin the inode, not - * update the radix tree or anything like that. - * - * The RCU read lock is taken as the inode is finally freed - * under RCU. If the mapping still matches expectations then the - * mapping->host can be safely accessed as being a valid inode. - */ - rcu_read_lock(); - - if (READ_ONCE(page->mapping) != mapping) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - inode = READ_ONCE(mapping->host); - if (!inode) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = page_to_pgoff(tail); - rcu_read_unlock(); - } - -out: - put_page(page); - return err; -} - -/** - * fault_in_user_writeable() - Fault in user address and verify RW access - * @uaddr: pointer to faulting user space address - * - * Slow path to fixup the fault we just took in the atomic write - * access to @uaddr. - * - * We have no generic implementation of a non-destructive write to the - * user address. We know that we faulted in the atomic pagefault - * disabled section so we can as well avoid the #PF overhead by - * calling get_user_pages() right away. 
- */ -static int fault_in_user_writeable(u32 __user *uaddr) -{ - struct mm_struct *mm = current->mm; - int ret; - - mmap_read_lock(mm); - ret = fixup_user_fault(mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE, NULL); - mmap_read_unlock(mm); - - return ret < 0 ? ret : 0; -} - -/** - * futex_top_waiter() - Return the highest priority waiter on a futex - * @hb: the hash bucket the futex_q's reside in - * @key: the futex key (to distinguish it from other futex futex_q's) - * - * Must be called with the hb lock held. - */ -static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, - union futex_key *key) -{ - struct futex_q *this; - - plist_for_each_entry(this, &hb->chain, list) { - if (match_futex(&this->key, key)) - return this; - } - return NULL; -} - -static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, - u32 uval, u32 newval) -{ - int ret; - - pagefault_disable(); - ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); - pagefault_enable(); - - return ret; -} - -static int get_futex_value_locked(u32 *dest, u32 __user *from) -{ - int ret; - - pagefault_disable(); - ret = __get_user(*dest, from); - pagefault_enable(); - - return ret ? -EFAULT : 0; -} - - -/* - * PI code: - */ -static int refill_pi_state_cache(void) -{ - struct futex_pi_state *pi_state; - - if (likely(current->pi_state_cache)) - return 0; - - pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); - - if (!pi_state) - return -ENOMEM; - - INIT_LIST_HEAD(&pi_state->list); - /* pi_mutex gets initialized later */ - pi_state->owner = NULL; - refcount_set(&pi_state->refcount, 1); - pi_state->key = FUTEX_KEY_INIT; - - current->pi_state_cache = pi_state; - - return 0; -} - -static struct futex_pi_state *alloc_pi_state(void) -{ - struct futex_pi_state *pi_state = current->pi_state_cache; - - WARN_ON(!pi_state); - current->pi_state_cache = NULL; - - return pi_state; -} - -static void pi_state_update_owner(struct futex_pi_state *pi_state, - struct task_struct *new_owner) -{ - struct task_struct *old_owner = pi_state->owner; - - lockdep_assert_held(&pi_state->pi_mutex.wait_lock); - - if (old_owner) { - raw_spin_lock(&old_owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&old_owner->pi_lock); - } - - if (new_owner) { - raw_spin_lock(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); - } -} - -static void get_pi_state(struct futex_pi_state *pi_state) -{ - WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); -} - -/* - * Drops a reference to the pi_state object and frees or caches it - * when the last reference is gone. - */ -static void put_pi_state(struct futex_pi_state *pi_state) -{ - if (!pi_state) - return; - - if (!refcount_dec_and_test(&pi_state->refcount)) - return; - - /* - * If pi_state->owner is NULL, the owner is most probably dying - * and has cleaned up the pi_state already - */ - if (pi_state->owner) { - unsigned long flags; - - raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); - pi_state_update_owner(pi_state, NULL); - rt_mutex_proxy_unlock(&pi_state->pi_mutex); - raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); - } - - if (current->pi_state_cache) { - kfree(pi_state); - } else { - /* - * pi_state->list is already empty. - * clear pi_state->owner. - * refcount is at 0 - put it back to 1. 
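refill_pi_state_cache(), alloc_pi_state() and put_pi_state() form a one-slot per-task object cache: allocate up front where sleeping is allowed, consume later where failure is not an option, and park the freed object back in the slot. A hedged sketch of that idiom outside the kernel, using thread-local storage in place of task_struct (illustrative names only):

#include <stdlib.h>

struct pi_like_state { int dummy; };

static _Thread_local struct pi_like_state *state_cache;

/* Call where blocking allocation is fine; a later alloc cannot fail. */
static int refill_state_cache(void)
{
	if (state_cache)
		return 0;
	state_cache = calloc(1, sizeof(*state_cache));
	return state_cache ? 0 : -1;
}

/* Call from the "no failure allowed" path: just hands out the slot. */
static struct pi_like_state *alloc_state(void)
{
	struct pi_like_state *s = state_cache;

	state_cache = NULL;
	return s;
}

/* Release: park the object in the empty slot, or really free it. */
static void put_state(struct pi_like_state *s)
{
	if (!state_cache)
		state_cache = s;
	else
		free(s);
}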
- */ - pi_state->owner = NULL; - refcount_set(&pi_state->refcount, 1); - current->pi_state_cache = pi_state; - } -} - -#ifdef CONFIG_FUTEX_PI - -/* - * This task is holding PI mutexes at exit time => bad. - * Kernel cleans up PI-state, but userspace is likely hosed. - * (Robust-futex cleanup is separate and might save the day for userspace.) - */ -static void exit_pi_state_list(struct task_struct *curr) -{ - struct list_head *next, *head = &curr->pi_state_list; - struct futex_pi_state *pi_state; - struct futex_hash_bucket *hb; - union futex_key key = FUTEX_KEY_INIT; - - if (!futex_cmpxchg_enabled) - return; - /* - * We are a ZOMBIE and nobody can enqueue itself on - * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselves: - */ - raw_spin_lock_irq(&curr->pi_lock); - while (!list_empty(head)) { - next = head->next; - pi_state = list_entry(next, struct futex_pi_state, list); - key = pi_state->key; - hb = hash_futex(&key); - - /* - * We can race against put_pi_state() removing itself from the - * list (a waiter going away). put_pi_state() will first - * decrement the reference count and then modify the list, so - * its possible to see the list entry but fail this reference - * acquire. - * - * In that case; drop the locks to let put_pi_state() make - * progress and retry the loop. - */ - if (!refcount_inc_not_zero(&pi_state->refcount)) { - raw_spin_unlock_irq(&curr->pi_lock); - cpu_relax(); - raw_spin_lock_irq(&curr->pi_lock); - continue; - } - raw_spin_unlock_irq(&curr->pi_lock); - - spin_lock(&hb->lock); - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - raw_spin_lock(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - /* retain curr->pi_lock for the loop invariant */ - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); - put_pi_state(pi_state); - continue; - } - - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - - raw_spin_unlock(&curr->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); - - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - - raw_spin_lock_irq(&curr->pi_lock); - } - raw_spin_unlock_irq(&curr->pi_lock); -} -#else -static inline void exit_pi_state_list(struct task_struct *curr) { } -#endif - -/* - * We need to check the following states: - * - * Waiter | pi_state | pi->owner | uTID | uODIED | ? - * - * [1] NULL | --- | --- | 0 | 0/1 | Valid - * [2] NULL | --- | --- | >0 | 0/1 | Valid - * - * [3] Found | NULL | -- | Any | 0/1 | Invalid - * - * [4] Found | Found | NULL | 0 | 1 | Valid - * [5] Found | Found | NULL | >0 | 1 | Invalid - * - * [6] Found | Found | task | 0 | 1 | Valid - * - * [7] Found | Found | NULL | Any | 0 | Invalid - * - * [8] Found | Found | task | ==taskTID | 0/1 | Valid - * [9] Found | Found | task | 0 | 0 | Invalid - * [10] Found | Found | task | !=taskTID | 0/1 | Invalid - * - * [1] Indicates that the kernel can acquire the futex atomically. We - * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. - * - * [2] Valid, if TID does not belong to a kernel thread. If no matching - * thread is found then it indicates that the owner TID has died. - * - * [3] Invalid. The waiter is queued on a non PI futex - * - * [4] Valid state after exit_robust_list(), which sets the user space - * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. 
- * - * [5] The user space value got manipulated between exit_robust_list() - * and exit_pi_state_list() - * - * [6] Valid state after exit_pi_state_list() which sets the new owner in - * the pi_state but cannot access the user space value. - * - * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. - * - * [8] Owner and user space value match - * - * [9] There is no transient state which sets the user space TID to 0 - * except exit_robust_list(), but this is indicated by the - * FUTEX_OWNER_DIED bit. See [4] - * - * [10] There is no transient state which leaves owner and user space - * TID out of sync. Except one error case where the kernel is denied - * write access to the user address, see fixup_pi_state_owner(). - * - * - * Serialization and lifetime rules: - * - * hb->lock: - * - * hb -> futex_q, relation - * futex_q -> pi_state, relation - * - * (cannot be raw because hb can contain arbitrary amount - * of futex_q's) - * - * pi_mutex->wait_lock: - * - * {uval, pi_state} - * - * (and pi_mutex 'obviously') - * - * p->pi_lock: - * - * p->pi_state_list -> pi_state->list, relation - * pi_mutex->owner -> pi_state->owner, relation - * - * pi_state->refcount: - * - * pi_state lifetime - * - * - * Lock order: - * - * hb->lock - * pi_mutex->wait_lock - * p->pi_lock - * - */ - -/* - * Validate that the existing waiter has a pi_state and sanity check - * the pi_state against the user space value. If correct, attach to - * it. - */ -static int attach_to_pi_state(u32 __user *uaddr, u32 uval, - struct futex_pi_state *pi_state, - struct futex_pi_state **ps) -{ - pid_t pid = uval & FUTEX_TID_MASK; - u32 uval2; - int ret; - - /* - * Userspace might have messed up non-PI and PI futexes [3] - */ - if (unlikely(!pi_state)) - return -EINVAL; - - /* - * We get here with hb->lock held, and having found a - * futex_top_waiter(). This means that futex_lock_pi() of said futex_q - * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), - * which in turn means that futex_lock_pi() still has a reference on - * our pi_state. - * - * The waiter holding a reference on @pi_state also protects against - * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() - * and futex_wait_requeue_pi() as it cannot go to 0 and consequently - * free pi_state before we can take a reference ourselves. - */ - WARN_ON(!refcount_read(&pi_state->refcount)); - - /* - * Now that we have a pi_state, we can acquire wait_lock - * and do the state validation. - */ - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - - /* - * Since {uval, pi_state} is serialized by wait_lock, and our current - * uval was read without holding it, it can have changed. Verify it - * still is what we expect it to be, otherwise retry the entire - * operation. - */ - if (get_futex_value_locked(&uval2, uaddr)) - goto out_efault; - - if (uval != uval2) - goto out_eagain; - - /* - * Handle the owner died case: - */ - if (uval & FUTEX_OWNER_DIED) { - /* - * exit_pi_state_list sets owner to NULL and wakes the - * topmost waiter. The task which acquires the - * pi_state->rt_mutex will fixup owner. - */ - if (!pi_state->owner) { - /* - * No pi state owner, but the user space TID - * is not 0. Inconsistent state. [5] - */ - if (pid) - goto out_einval; - /* - * Take a ref on the state and return success. 
[4] - */ - goto out_attach; - } - - /* - * If TID is 0, then either the dying owner has not - * yet executed exit_pi_state_list() or some waiter - * acquired the rtmutex in the pi state, but did not - * yet fixup the TID in user space. - * - * Take a ref on the state and return success. [6] - */ - if (!pid) - goto out_attach; - } else { - /* - * If the owner died bit is not set, then the pi_state - * must have an owner. [7] - */ - if (!pi_state->owner) - goto out_einval; - } - - /* - * Bail out if user space manipulated the futex value. If pi - * state exists then the owner TID must be the same as the - * user space TID. [9/10] - */ - if (pid != task_pid_vnr(pi_state->owner)) - goto out_einval; - -out_attach: - get_pi_state(pi_state); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - *ps = pi_state; - return 0; - -out_einval: - ret = -EINVAL; - goto out_error; - -out_eagain: - ret = -EAGAIN; - goto out_error; - -out_efault: - ret = -EFAULT; - goto out_error; - -out_error: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; -} - -/** - * wait_for_owner_exiting - Block until the owner has exited - * @ret: owner's current futex lock status - * @exiting: Pointer to the exiting task - * - * Caller must hold a refcount on @exiting. - */ -static void wait_for_owner_exiting(int ret, struct task_struct *exiting) -{ - if (ret != -EBUSY) { - WARN_ON_ONCE(exiting); - return; - } - - if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) - return; - - mutex_lock(&exiting->futex_exit_mutex); - /* - * No point in doing state checking here. If the waiter got here - * while the task was in exec()->exec_futex_release() then it can - * have any FUTEX_STATE_* value when the waiter has acquired the - * mutex. OK, if running, EXITING or DEAD if it reached exit() - * already. Highly unlikely and not a problem. Just one more round - * through the futex maze. - */ - mutex_unlock(&exiting->futex_exit_mutex); - - put_task_struct(exiting); -} - -static int handle_exit_race(u32 __user *uaddr, u32 uval, - struct task_struct *tsk) -{ - u32 uval2; - - /* - * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the - * caller that the alleged owner is busy. - */ - if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) - return -EBUSY; - - /* - * Reread the user space value to handle the following situation: - * - * CPU0 CPU1 - * - * sys_exit() sys_futex() - * do_exit() futex_lock_pi() - * futex_lock_pi_atomic() - * exit_signals(tsk) No waiters: - * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID - * mm_release(tsk) Set waiter bit - * exit_robust_list(tsk) { *uaddr = 0x80000PID; - * Set owner died attach_to_pi_owner() { - * *uaddr = 0xC0000000; tsk = get_task(PID); - * } if (!tsk->flags & PF_EXITING) { - * ... attach(); - * tsk->futex_state = } else { - * FUTEX_STATE_DEAD; if (tsk->futex_state != - * FUTEX_STATE_DEAD) - * return -EAGAIN; - * return -ESRCH; <--- FAIL - * } - * - * Returning ESRCH unconditionally is wrong here because the - * user space value has been changed by the exiting task. - * - * The same logic applies to the case where the exiting task is - * already gone. - */ - if (get_futex_value_locked(&uval2, uaddr)) - return -EFAULT; - - /* If the user space value has changed, try again. */ - if (uval2 != uval) - return -EAGAIN; - - /* - * The exiting task did not have a robust list, the robust list was - * corrupted or the user space value in *uaddr is simply bogus. - * Give up and tell user space. 
- */ - return -ESRCH; -} - -static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, - struct futex_pi_state **ps) -{ - /* - * No existing pi state. First waiter. [2] - * - * This creates pi_state, we have hb->lock held, this means nothing can - * observe this state, wait_lock is irrelevant. - */ - struct futex_pi_state *pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make @p - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - /* - * Assignment without holding pi_state->pi_mutex.wait_lock is safe - * because there is no concurrency as the object is not published yet. - */ - pi_state->owner = p; - - *ps = pi_state; -} -/* - * Lookup the task for the TID provided from user space and attach to - * it after doing proper sanity checks. - */ -static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, - struct futex_pi_state **ps, - struct task_struct **exiting) -{ - pid_t pid = uval & FUTEX_TID_MASK; - struct task_struct *p; - - /* - * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 [1] - * - * The !pid check is paranoid. None of the call sites should end up - * with pid == 0, but better safe than sorry. Let the caller retry - */ - if (!pid) - return -EAGAIN; - p = find_get_task_by_vpid(pid); - if (!p) - return handle_exit_race(uaddr, uval, NULL); - - if (unlikely(p->flags & PF_KTHREAD)) { - put_task_struct(p); - return -EPERM; - } - - /* - * We need to look at the task state to figure out, whether the - * task is exiting. To protect against the change of the task state - * in futex_exit_release(), we do this protected by p->pi_lock: - */ - raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->futex_state != FUTEX_STATE_OK)) { - /* - * The task is on the way out. When the futex state is - * FUTEX_STATE_DEAD, we know that the task has finished - * the cleanup: - */ - int ret = handle_exit_race(uaddr, uval, p); - - raw_spin_unlock_irq(&p->pi_lock); - /* - * If the owner task is between FUTEX_STATE_EXITING and - * FUTEX_STATE_DEAD then store the task pointer and keep - * the reference on the task struct. The calling code will - * drop all locks, wait for the task to reach - * FUTEX_STATE_DEAD and then drop the refcount. This is - * required to prevent a live lock when the current task - * preempted the exiting task between the two states. - */ - if (ret == -EBUSY) - *exiting = p; - else - put_task_struct(p); - return ret; - } - - __attach_to_pi_owner(p, key, ps); - raw_spin_unlock_irq(&p->pi_lock); - - put_task_struct(p); - - return 0; -} - -static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) -{ - int err; - u32 curval; - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); - if (unlikely(err)) - return err; - - /* If user space value changed, let the caller retry */ - return curval != uval ? -EAGAIN : 0; -} - -/** - * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex - * @uaddr: the pi futex user address - * @hb: the pi futex hash bucket - * @key: the futex key associated with uaddr and hb - * @ps: the pi_state pointer where we store the result of the - * lookup - * @task: the task to perform the atomic lock work for. 
This will - * be "current" except in the case of requeue pi. - * @exiting: Pointer to store the task pointer of the owner task - * which is in the middle of exiting - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) - * - * Return: - * - 0 - ready to wait; - * - 1 - acquired the lock; - * - <0 - error - * - * The hb->lock must be held by the caller. - * - * @exiting is only set when the return value is -EBUSY. If so, this holds - * a refcount on the exiting task on return and the caller needs to drop it - * after waiting for the exit to complete. - */ -static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, - union futex_key *key, - struct futex_pi_state **ps, - struct task_struct *task, - struct task_struct **exiting, - int set_waiters) -{ - u32 uval, newval, vpid = task_pid_vnr(task); - struct futex_q *top_waiter; - int ret; - - /* - * Read the user space value first so we can validate a few - * things before proceeding further. - */ - if (get_futex_value_locked(&uval, uaddr)) - return -EFAULT; - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - /* - * Detect deadlocks. - */ - if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) - return -EDEADLK; - - if ((unlikely(should_fail_futex(true)))) - return -EDEADLK; - - /* - * Lookup existing state first. If it exists, try to attach to - * its pi_state. - */ - top_waiter = futex_top_waiter(hb, key); - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); - - /* - * No waiter and user TID is 0. We are here because the - * waiters or the owner died bit is set or called from - * requeue_cmp_pi or for whatever reason something took the - * syscall. - */ - if (!(uval & FUTEX_TID_MASK)) { - /* - * We take over the futex. No other waiters and the user space - * TID is 0. We preserve the owner died bit. - */ - newval = uval & FUTEX_OWNER_DIED; - newval |= vpid; - - /* The futex requeue_pi code can enforce the waiters bit */ - if (set_waiters) - newval |= FUTEX_WAITERS; - - ret = lock_pi_update_atomic(uaddr, uval, newval); - if (ret) - return ret; - - /* - * If the waiter bit was requested the caller also needs PI - * state attached to the new owner of the user space futex. - * - * @task is guaranteed to be alive and it cannot be exiting - * because it is either sleeping or waiting in - * futex_requeue_pi_wakeup_sync(). - * - * No need to do the full attach_to_pi_owner() exercise - * because @task is known and valid. - */ - if (set_waiters) { - raw_spin_lock_irq(&task->pi_lock); - __attach_to_pi_owner(task, key, ps); - raw_spin_unlock_irq(&task->pi_lock); - } - return 1; - } - - /* - * First waiter. Set the waiters bit before attaching ourself to - * the owner. If owner tries to unlock, it will be forced into - * the kernel and blocked on hb->lock. - */ - newval = uval | FUTEX_WAITERS; - ret = lock_pi_update_atomic(uaddr, uval, newval); - if (ret) - return ret; - /* - * If the update of the user space value succeeded, we try to - * attach to the owner. If that fails, no harm done, we only - * set the FUTEX_WAITERS bit in the user space variable. - */ - return attach_to_pi_owner(uaddr, newval, key, ps, exiting); -} - -/** - * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket - * @q: The futex_q to unqueue - * - * The q->lock_ptr must not be NULL and must be held by the caller. 
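futex_lock_pi_atomic() is the kernel half of the PI locking protocol; the user-space half it assumes looks roughly like the sketch below. This is illustrative only; a production implementation (e.g. glibc's PI mutexes) also deals with FUTEX_OWNER_DIED, robust lists and spurious CAS failures:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

/* The futex word of a PI lock holds the owner's TID (0 == unlocked). */
static void pi_lock(atomic_uint *futex_word)
{
	unsigned int expected = 0;
	unsigned int tid = syscall(SYS_gettid);

	/* Fast path: 0 -> TID in user space, no kernel involvement. */
	if (atomic_compare_exchange_strong(futex_word, &expected, tid))
		return;

	/* Contended: the kernel sets FUTEX_WAITERS, attaches pi_state and
	 * boosts the owner via the rtmutex (futex_lock_pi_atomic()). */
	syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(atomic_uint *futex_word)
{
	unsigned int expected = syscall(SYS_gettid);

	/* Fast path: TID -> 0 only works if no waiter bits are set. */
	if (atomic_compare_exchange_strong(futex_word, &expected, 0))
		return;

	/* FUTEX_WAITERS (and/or OWNER_DIED) is set: let the kernel hand
	 * the lock to the top waiter (wake_futex_pi()). */
	syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}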
- */ -static void __unqueue_futex(struct futex_q *q) -{ - struct futex_hash_bucket *hb; - - if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) - return; - lockdep_assert_held(q->lock_ptr); - - hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); -} - -/* - * The hash bucket lock must be held when this is called. - * Afterwards, the futex_q must not be accessed. Callers - * must ensure to later call wake_up_q() for the actual - * wakeups to occur. - */ -static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) -{ - struct task_struct *p = q->task; - - if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) - return; - - get_task_struct(p); - __unqueue_futex(q); - /* - * The waiting task can free the futex_q as soon as q->lock_ptr = NULL - * is written, without taking any locks. This is possible in the event - * of a spurious wakeup, for example. A memory barrier is required here - * to prevent the following store to lock_ptr from getting ahead of the - * plist_del in __unqueue_futex(). - */ - smp_store_release(&q->lock_ptr, NULL); - - /* - * Queue the task for later wakeup for after we've released - * the hb->lock. - */ - wake_q_add_safe(wake_q, p); -} - -/* - * Caller must hold a reference on @pi_state. - */ -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) -{ - struct rt_mutex_waiter *top_waiter; - struct task_struct *new_owner; - bool postunlock = false; - DEFINE_RT_WAKE_Q(wqh); - u32 curval, newval; - int ret = 0; - - top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); - if (WARN_ON_ONCE(!top_waiter)) { - /* - * As per the comment in futex_unlock_pi() this should not happen. - * - * When this happens, give up our locks and try again, giving - * the futex_lock_pi() instance time to complete, either by - * waiting on the rtmutex or removing itself from the futex - * queue. - */ - ret = -EAGAIN; - goto out_unlock; - } - - new_owner = top_waiter->task; - - /* - * We pass it to the next owner. The WAITERS bit is always kept - * enabled while there is PI state around. We cleanup the owner - * died bit, because we are the owner. - */ - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - - if (unlikely(should_fail_futex(true))) { - ret = -EFAULT; - goto out_unlock; - } - - ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); - if (!ret && (curval != uval)) { - /* - * If a unconditional UNLOCK_PI operation (user space did not - * try the TID->0 transition) raced with a waiter setting the - * FUTEX_WAITERS flag between get_user() and locking the hash - * bucket lock, retry the operation. - */ - if ((FUTEX_TID_MASK & curval) == uval) - ret = -EAGAIN; - else - ret = -EINVAL; - } - - if (!ret) { - /* - * This is a point of no return; once we modified the uval - * there is no going back and subsequent operations must - * not fail. 
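The smp_store_release() in mark_wake_futex() is the usual publish pattern: finish the dequeue, then release-store the flag the waiter polls; the waiter's matching acquire load lets it free the futex_q locklessly. A hedged C11 analogue with illustrative names:

#include <stdatomic.h>
#include <stddef.h>

struct waiter {
	int on_queue;              /* stands in for !plist_node_empty() */
	_Atomic(void *) lock_ptr;  /* stands in for q->lock_ptr         */
};

/* Waker: dequeue first, then publish "you are woken" with release
 * semantics so the dequeue cannot be reordered after the store. */
static void wake(struct waiter *q)
{
	q->on_queue = 0;                              /* __unqueue_futex() */
	atomic_store_explicit(&q->lock_ptr, NULL, memory_order_release);
}

/* Waiter: once it observes lock_ptr == NULL (acquire), it also sees the
 * dequeue and may free 'q' without taking any lock. */
static int woken(struct waiter *q)
{
	return atomic_load_explicit(&q->lock_ptr, memory_order_acquire) == NULL;
}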
- */ - pi_state_update_owner(pi_state, new_owner); - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); - } - -out_unlock: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - - if (postunlock) - rt_mutex_postunlock(&wqh); - - return ret; -} - -/* - * Express the locking dependencies for lockdep: - */ -static inline void -double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - if (hb1 <= hb2) { - spin_lock(&hb1->lock); - if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); - } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); - } -} - -static inline void -double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); -} - -/* - * Wake up waiters matching bitset queued on this futex (uaddr). - */ -static int -futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) -{ - struct futex_hash_bucket *hb; - struct futex_q *this, *next; - union futex_key key = FUTEX_KEY_INIT; - int ret; - DEFINE_WAKE_Q(wake_q); - - if (!bitset) - return -EINVAL; - - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - - hb = hash_futex(&key); - - /* Make sure we really have tasks to wakeup */ - if (!hb_waiters_pending(hb)) - return ret; - - spin_lock(&hb->lock); - - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (match_futex (&this->key, &key)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - break; - } - - /* Check if one of the bits is set in both bitsets */ - if (!(this->bitset & bitset)) - continue; - - mark_wake_futex(&wake_q, this); - if (++ret >= nr_wake) - break; - } - } - - spin_unlock(&hb->lock); - wake_up_q(&wake_q); - return ret; -} - -static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) -{ - unsigned int op = (encoded_op & 0x70000000) >> 28; - unsigned int cmp = (encoded_op & 0x0f000000) >> 24; - int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); - int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); - int oldval, ret; - - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { - if (oparg < 0 || oparg > 31) { - char comm[sizeof(current->comm)]; - /* - * kill this print and return -EINVAL when userspace - * is sane again - */ - pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", - get_task_comm(comm, current), oparg); - oparg &= 31; - } - oparg = 1 << oparg; - } - - pagefault_disable(); - ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); - pagefault_enable(); - if (ret) - return ret; - - switch (cmp) { - case FUTEX_OP_CMP_EQ: - return oldval == cmparg; - case FUTEX_OP_CMP_NE: - return oldval != cmparg; - case FUTEX_OP_CMP_LT: - return oldval < cmparg; - case FUTEX_OP_CMP_GE: - return oldval >= cmparg; - case FUTEX_OP_CMP_LE: - return oldval <= cmparg; - case FUTEX_OP_CMP_GT: - return oldval > cmparg; - default: - return -ENOSYS; - } -} - -/* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: - */ -static int -futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - int nr_wake, int nr_wake2, int op) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb1, *hb2; - struct futex_q *this, *next; - int ret, op_ret; - DEFINE_WAKE_Q(wake_q); - -retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); 
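futex_atomic_op_inuser() above decodes the 32-bit encoded_op that user space builds for FUTEX_WAKE_OP (op:4 | cmp:4 | oparg:12 | cmparg:12). The uapi FUTEX_OP() macro packs the same layout; a typical call, sketched with an assumed wrapper name, looks like this:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

/* "Set *uaddr2 = 1, then additionally wake waiters on uaddr2 if its old
 * value was 0", on top of waking up to nr_wake waiters on uaddr1. */
static long wake_op_example(uint32_t *uaddr1, uint32_t *uaddr2,
			    int nr_wake, int nr_wake2)
{
	uint32_t op = FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);

	/* val2 (nr_wake2) travels in the timeout slot of the syscall. */
	return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, nr_wake,
		       (void *)(uintptr_t)nr_wake2, uaddr2, op);
}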
- if (unlikely(ret != 0)) - return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); - if (unlikely(ret != 0)) - return ret; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry_private: - double_lock_hb(hb1, hb2); - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); - - if (!IS_ENABLED(CONFIG_MMU) || - unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { - /* - * we don't get EFAULT from MMU faults if we don't have - * an MMU, but we might get them from range checking - */ - ret = op_ret; - return ret; - } - - if (op_ret == -EFAULT) { - ret = fault_in_user_writeable(uaddr2); - if (ret) - return ret; - } - - cond_resched(); - if (!(flags & FLAGS_SHARED)) - goto retry_private; - goto retry; - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (match_futex (&this->key, &key1)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; - } - mark_wake_futex(&wake_q, this); - if (++ret >= nr_wake) - break; - } - } - - if (op_ret > 0) { - op_ret = 0; - plist_for_each_entry_safe(this, next, &hb2->chain, list) { - if (match_futex (&this->key, &key2)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; - } - mark_wake_futex(&wake_q, this); - if (++op_ret >= nr_wake2) - break; - } - } - ret += op_ret; - } - -out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); - return ret; -} - -/** - * requeue_futex() - Requeue a futex_q from one hb to another - * @q: the futex_q to requeue - * @hb1: the source hash_bucket - * @hb2: the target hash_bucket - * @key2: the new key for the requeued futex_q - */ -static inline -void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, union futex_key *key2) -{ - - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(&hb1->chain != &hb2->chain)) { - plist_del(&q->list, &hb1->chain); - hb_waiters_dec(hb1); - hb_waiters_inc(hb2); - plist_add(&q->list, &hb2->chain); - q->lock_ptr = &hb2->lock; - } - q->key = *key2; -} - -static inline bool futex_requeue_pi_prepare(struct futex_q *q, - struct futex_pi_state *pi_state) -{ - int old, new; - - /* - * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has - * already set Q_REQUEUE_PI_IGNORE to signal that requeue should - * ignore the waiter. - */ - old = atomic_read_acquire(&q->requeue_state); - do { - if (old == Q_REQUEUE_PI_IGNORE) - return false; - - /* - * futex_proxy_trylock_atomic() might have set it to - * IN_PROGRESS and a interleaved early wake to WAIT. - * - * It was considered to have an extra state for that - * trylock, but that would just add more conditionals - * all over the place for a dubious value. - */ - if (old != Q_REQUEUE_PI_NONE) - break; - - new = Q_REQUEUE_PI_IN_PROGRESS; - } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); - - q->pi_state = pi_state; - return true; -} - -static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) -{ - int old, new; - - old = atomic_read_acquire(&q->requeue_state); - do { - if (old == Q_REQUEUE_PI_IGNORE) - return; - - if (locked >= 0) { - /* Requeue succeeded. Set DONE or LOCKED */ - WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && - old != Q_REQUEUE_PI_WAIT); - new = Q_REQUEUE_PI_DONE + locked; - } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { - /* Deadlock, no early wakeup interleave */ - new = Q_REQUEUE_PI_NONE; - } else { - /* Deadlock, early wakeup interleave. 
*/ - WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); - new = Q_REQUEUE_PI_IGNORE; - } - } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); - -#ifdef CONFIG_PREEMPT_RT - /* If the waiter interleaved with the requeue let it know */ - if (unlikely(old == Q_REQUEUE_PI_WAIT)) - rcuwait_wake_up(&q->requeue_wait); -#endif -} - -static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) -{ - int old, new; - - old = atomic_read_acquire(&q->requeue_state); - do { - /* Is requeue done already? */ - if (old >= Q_REQUEUE_PI_DONE) - return old; - - /* - * If not done, then tell the requeue code to either ignore - * the waiter or to wake it up once the requeue is done. - */ - new = Q_REQUEUE_PI_WAIT; - if (old == Q_REQUEUE_PI_NONE) - new = Q_REQUEUE_PI_IGNORE; - } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); - - /* If the requeue was in progress, wait for it to complete */ - if (old == Q_REQUEUE_PI_IN_PROGRESS) { -#ifdef CONFIG_PREEMPT_RT - rcuwait_wait_event(&q->requeue_wait, - atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, - TASK_UNINTERRUPTIBLE); -#else - (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); -#endif - } - - /* - * Requeue is now either prohibited or complete. Reread state - * because during the wait above it might have changed. Nothing - * will modify q->requeue_state after this point. - */ - return atomic_read(&q->requeue_state); -} - -/** - * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * @q: the futex_q - * @key: the key of the requeue target futex - * @hb: the hash_bucket of the requeue target futex - * - * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. - * - * 1) Set @q::key to the requeue target futex key so the waiter can detect - * the wakeup on the right futex. - * - * 2) Dequeue @q from the hash bucket. - * - * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock - * acquisition. - * - * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that - * the waiter has to fixup the pi state. - * - * 5) Complete the requeue state so the waiter can make progress. After - * this point the waiter task can return from the syscall immediately in - * case that the pi state does not have to be fixed up. - * - * 6) Wake the waiter task. - * - * Must be called with both q->lock_ptr and hb->lock held. - */ -static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, - struct futex_hash_bucket *hb) -{ - q->key = *key; - - __unqueue_futex(q); - - WARN_ON(!q->rt_waiter); - q->rt_waiter = NULL; - - q->lock_ptr = &hb->lock; - - /* Signal locked state to the waiter */ - futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); -} - -/** - * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key - * @ps: address to store the pi_state pointer - * @exiting: Pointer to store the task pointer of the owner task - * which is in the middle of exiting - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) - * - * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. 
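The requeue_state handshake is a plain compare-and-swap state machine; atomic_try_cmpxchg() in the kernel corresponds to atomic_compare_exchange_weak() in C11, which likewise refreshes the expected value on failure. A stripped-down, illustrative sketch of the waiter-side transition:

#include <stdatomic.h>

enum { REQ_NONE, REQ_IGNORE, REQ_IN_PROGRESS, REQ_WAIT, REQ_DONE, REQ_LOCKED };

/* Waiter woke up early: tell the requeue side to either ignore it
 * (nothing started yet) or to signal once the requeue is finished. */
static int early_wakeup(atomic_int *state)
{
	int old = atomic_load_explicit(state, memory_order_acquire);
	int new;

	do {
		if (old >= REQ_DONE)           /* requeue already finished */
			return old;

		new = (old == REQ_NONE) ? REQ_IGNORE : REQ_WAIT;
		/* On failure 'old' is refreshed and the loop re-decides. */
	} while (!atomic_compare_exchange_weak(state, &old, new));

	/* If we set REQ_WAIT, the real code then waits for the requeue
	 * side to move the state on to DONE/LOCKED or IGNORE. */
	return new;
}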
If the caller specified set_waiters, - * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. - * hb1 and hb2 must be held by the caller. - * - * @exiting is only set when the return value is -EBUSY. If so, this holds - * a refcount on the exiting task on return and the caller needs to drop it - * after waiting for the exit to complete. - * - * Return: - * - 0 - failed to acquire the lock atomically; - * - >0 - acquired the lock, return value is vpid of the top_waiter - * - <0 - error - */ -static int -futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, union futex_key *key1, - union futex_key *key2, struct futex_pi_state **ps, - struct task_struct **exiting, int set_waiters) -{ - struct futex_q *top_waiter = NULL; - u32 curval; - int ret; - - if (get_futex_value_locked(&curval, pifutex)) - return -EFAULT; - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - /* - * Find the top_waiter and determine if there are additional waiters. - * If the caller intends to requeue more than 1 waiter to pifutex, - * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, - * as we have means to handle the possible fault. If not, don't set - * the bit unnecessarily as it will force the subsequent unlock to enter - * the kernel. - */ - top_waiter = futex_top_waiter(hb1, key1); - - /* There are no waiters, nothing for us to do. */ - if (!top_waiter) - return 0; - - /* - * Ensure that this is a waiter sitting in futex_wait_requeue_pi() - * and waiting on the 'waitqueue' futex which is always !PI. - */ - if (!top_waiter->rt_waiter || top_waiter->pi_state) - return -EINVAL; - - /* Ensure we requeue to the expected futex. */ - if (!match_futex(top_waiter->requeue_pi_key, key2)) - return -EINVAL; - - /* Ensure that this does not race against an early wakeup */ - if (!futex_requeue_pi_prepare(top_waiter, NULL)) - return -EAGAIN; - - /* - * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit - * in the contended case or if @set_waiters is true. - * - * In the contended case PI state is attached to the lock owner. If - * the user space lock can be acquired then PI state is attached to - * the new owner (@top_waiter->task) when @set_waiters is true. - */ - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - exiting, set_waiters); - if (ret == 1) { - /* - * Lock was acquired in user space and PI state was - * attached to @top_waiter->task. That means state is fully - * consistent and the waiter can return to user space - * immediately after the wakeup. - */ - requeue_pi_wake_futex(top_waiter, key2, hb2); - } else if (ret < 0) { - /* Rewind top_waiter::requeue_state */ - futex_requeue_pi_complete(top_waiter, ret); - } else { - /* - * futex_lock_pi_atomic() did not acquire the user space - * futex, but managed to establish the proxy lock and pi - * state. top_waiter::requeue_state cannot be fixed up here - * because the waiter is not enqueued on the rtmutex - * yet. This is handled at the callsite depending on the - * result of rt_mutex_start_proxy_lock() which is - * guaranteed to be reached with this function returning 0. - */ - } - return ret; -} - -/** - * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) 
- * @uaddr2: target futex user address - * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * @nr_requeue: number of waiters to requeue (0-INT_MAX) - * @cmpval: @uaddr1 expected value (or %NULL) - * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) - * - * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire - * uaddr2 atomically on behalf of the top waiter. - * - * Return: - * - >=0 - on success, the number of tasks requeued or woken; - * - <0 - on error - */ -static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - u32 __user *uaddr2, int nr_wake, int nr_requeue, - u32 *cmpval, int requeue_pi) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int task_count = 0, ret; - struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; - struct futex_q *this, *next; - DEFINE_WAKE_Q(wake_q); - - if (nr_wake < 0 || nr_requeue < 0) - return -EINVAL; - - /* - * When PI not supported: return -ENOSYS if requeue_pi is true, - * consequently the compiler knows requeue_pi is always false past - * this point which will optimize away all the conditional code - * further down. - */ - if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) - return -ENOSYS; - - if (requeue_pi) { - /* - * Requeue PI only works on two distinct uaddrs. This - * check is only valid for private futexes. See below. - */ - if (uaddr1 == uaddr2) - return -EINVAL; - - /* - * futex_requeue() allows the caller to define the number - * of waiters to wake up via the @nr_wake argument. With - * REQUEUE_PI, waking up more than one waiter is creating - * more problems than it solves. Waking up a waiter makes - * only sense if the PI futex @uaddr2 is uncontended as - * this allows the requeue code to acquire the futex - * @uaddr2 before waking the waiter. The waiter can then - * return to user space without further action. A secondary - * wakeup would just make the futex_wait_requeue_pi() - * handling more complex, because that code would have to - * look up pi_state and do more or less all the handling - * which the requeue code has to do for the to be requeued - * waiters. So restrict the number of waiters to wake to - * one, and only wake it up when the PI futex is - * uncontended. Otherwise requeue it and let the unlock of - * the PI futex handle the wakeup. - * - * All REQUEUE_PI users, e.g. pthread_cond_signal() and - * pthread_cond_broadcast() must use nr_wake=1. - */ - if (nr_wake != 1) - return -EINVAL; - - /* - * requeue_pi requires a pi_state, try to allocate it now - * without any locks in case it fails. - */ - if (refill_pi_state_cache()) - return -ENOMEM; - } - -retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? FUTEX_WRITE : FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. 
We need to compare the keys: - */ - if (requeue_pi && match_futex(&key1, &key2)) - return -EINVAL; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry_private: - hb_waiters_inc(hb2); - double_lock_hb(hb1, hb2); - - if (likely(cmpval != NULL)) { - u32 curval; - - ret = get_futex_value_locked(&curval, uaddr1); - - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); - - ret = get_user(curval, uaddr1); - if (ret) - return ret; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } - - if (requeue_pi) { - struct task_struct *exiting = NULL; - - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. We force this here where we are able to easily handle - * faults rather in the requeue loop below. - * - * Updates topwaiter::requeue_state if a top waiter exists. - */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, - &exiting, nr_requeue); - - /* - * At this point the top_waiter has either taken uaddr2 or - * is waiting on it. In both cases pi_state has been - * established and an initial refcount on it. In case of an - * error there's nothing. - * - * The top waiter's requeue_state is up to date: - * - * - If the lock was acquired atomically (ret == 1), then - * the state is Q_REQUEUE_PI_LOCKED. - * - * The top waiter has been dequeued and woken up and can - * return to user space immediately. The kernel/user - * space state is consistent. In case that there must be - * more waiters requeued the WAITERS bit in the user - * space futex is set so the top waiter task has to go - * into the syscall slowpath to unlock the futex. This - * will block until this requeue operation has been - * completed and the hash bucket locks have been - * dropped. - * - * - If the trylock failed with an error (ret < 0) then - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing - * happened", or Q_REQUEUE_PI_IGNORE when there was an - * interleaved early wakeup. - * - * - If the trylock did not succeed (ret == 0) then the - * state is either Q_REQUEUE_PI_IN_PROGRESS or - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. - * This will be cleaned up in the loop below, which - * cannot fail because futex_proxy_trylock_atomic() did - * the same sanity checks for requeue_pi as the loop - * below does. - */ - switch (ret) { - case 0: - /* We hold a reference on the pi state. */ - break; - - case 1: - /* - * futex_proxy_trylock_atomic() acquired the user space - * futex. Adjust task_count. - */ - task_count++; - ret = 0; - break; - - /* - * If the above failed, then pi_state is NULL and - * waiter::requeue_state is correct. - */ - case -EFAULT: - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - return ret; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Owner is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); - /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. 
- */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!match_futex(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - * - * We should never be requeueing a futex_q with a pi_state, - * which is awaiting a futex_unlock_pi(). - */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter) || - this->pi_state) { - ret = -EINVAL; - break; - } - - /* Plain futexes just wake or requeue and are done */ - if (!requeue_pi) { - if (++task_count <= nr_wake) - mark_wake_futex(&wake_q, this); - else - requeue_futex(this, hb1, hb2, &key2); - continue; - } - - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (!match_futex(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } - - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - * - * Prepare the waiter to take the rt_mutex. Take a refcount - * on the pi_state and store the pointer in the futex_q - * object of the waiter. - */ - get_pi_state(pi_state); - - /* Don't requeue when the waiter is already on the way out. */ - if (!futex_requeue_pi_prepare(this, pi_state)) { - /* - * Early woken waiter signaled that it is on the - * way out. Drop the pi_state reference and try the - * next waiter. @this->pi_state is still NULL. - */ - put_pi_state(pi_state); - continue; - } - - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - - if (ret == 1) { - /* - * We got the lock. We do neither drop the refcount - * on pi_state nor clear this->pi_state because the - * waiter needs the pi_state for cleaning up the - * user space value. It will drop the refcount - * after doing so. this::requeue_state is updated - * in the wakeup as well. - */ - requeue_pi_wake_futex(this, &key2, hb2); - task_count++; - } else if (!ret) { - /* Waiter is queued, move it to hb2 */ - requeue_futex(this, hb1, hb2, &key2); - futex_requeue_pi_complete(this, 0); - task_count++; - } else { - /* - * rt_mutex_start_proxy_lock() detected a potential - * deadlock when we tried to queue that waiter. - * Drop the pi_state reference which we took above - * and remove the pointer to the state from the - * waiters futex_q object. - */ - this->pi_state = NULL; - put_pi_state(pi_state); - futex_requeue_pi_complete(this, ret); - /* - * We stop queueing more waiters and let user space - * deal with the mess. - */ - break; - } - } - - /* - * We took an extra initial reference to the pi_state in - * futex_proxy_trylock_atomic(). We need to drop it here again. - */ - put_pi_state(pi_state); - -out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); - hb_waiters_dec(hb2); - return ret ? ret : task_count; -} - -/* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) - __acquires(&hb->lock) -{ - struct futex_hash_bucket *hb; - - hb = hash_futex(&q->key); - - /* - * Increment the counter before taking the lock so that - * a potential waker won't miss a to-be-slept task that is - * waiting for the spinlock. This is safe as all queue_lock() - * users end up calling queue_me(). 
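Stepping out of the kernel for a moment, the plain (non-PI) requeue implemented above is what a glibc-style condition variable broadcast is built on: wake a single waiter and move the rest from the condvar word onto the mutex word, so they are woken one at a time as the lock gets released instead of stampeding. A hedged sketch of the syscall usage (the struct and its fields are invented for illustration; a real condition variable needs considerably more state):

#include <errno.h>
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

struct cv {                     /* hypothetical condvar/mutex pair */
        uint32_t seq;           /* waiters block on this word */
        uint32_t lock;          /* they get requeued onto this word */
};

static void cv_broadcast(struct cv *cv)
{
        uint32_t seq = __atomic_add_fetch(&cv->seq, 1, __ATOMIC_SEQ_CST);

        /*
         * nr_wake = 1, nr_requeue = INT_MAX (passed in the timeout slot),
         * val3 = the value of 'seq' we expect; -EAGAIN means a concurrent
         * signal changed it and the operation should simply be retried.
         */
        while (syscall(SYS_futex, &cv->seq, FUTEX_CMP_REQUEUE_PRIVATE,
                       1, (unsigned long)INT_MAX, &cv->lock, seq) == -1 &&
               errno == EAGAIN)
                seq = __atomic_load_n(&cv->seq, __ATOMIC_SEQ_CST);
}

Requeueing rather than waking everything is purely a thundering-herd avoidance; each woken waiter still has to take the user-space lock as usual.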
Similarly, for housekeeping, - * decrement the counter at queue_unlock() when some error has - * occurred and we don't end up adding the task to the list. - */ - hb_waiters_inc(hb); /* implies smp_mb(); (A) */ - - q->lock_ptr = &hb->lock; - - spin_lock(&hb->lock); - return hb; -} - -static inline void -queue_unlock(struct futex_hash_bucket *hb) - __releases(&hb->lock) -{ - spin_unlock(&hb->lock); - hb_waiters_dec(hb); -} - -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -{ - int prio; - - /* - * The priority used to register this element is - * - either the real thread-priority for the real-time threads - * (i.e. threads with a priority lower than MAX_RT_PRIO) - * - or MAX_RT_PRIO for non-RT threads. - * Thus, all RT-threads are woken first in priority order, and - * the others are woken last, in FIFO order. - */ - prio = min(current->normal_prio, MAX_RT_PRIO); - - plist_node_init(&q->list, prio); - plist_add(&q->list, &hb->chain); - q->task = current; -} - -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) -{ - __queue_me(q, hb); - spin_unlock(&hb->lock); -} - -/** - * unqueue_me() - Remove the futex_q from its futex_hash_bucket - * @q: The futex_q to unqueue - * - * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must - * be paired with exactly one earlier call to queue_me(). - * - * Return: - * - 1 - if the futex_q was still queued (and we removed unqueued it); - * - 0 - if the futex_q was already removed by the waking thread - */ -static int unqueue_me(struct futex_q *q) -{ - spinlock_t *lock_ptr; - int ret = 0; - - /* In the common case we don't take the spinlock, which is nice. */ -retry: - /* - * q->lock_ptr can change between this read and the following spin_lock. - * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and - * optimizing lock_ptr out of the logic below. - */ - lock_ptr = READ_ONCE(q->lock_ptr); - if (lock_ptr != NULL) { - spin_lock(lock_ptr); - /* - * q->lock_ptr can change between reading it and - * spin_lock(), causing us to take the wrong lock. This - * corrects the race condition. - * - * Reasoning goes like this: if we have the wrong lock, - * q->lock_ptr must have changed (maybe several times) - * between reading it and the spin_lock(). It can - * change again after the spin_lock() but only if it was - * already changed before the spin_lock(). It cannot, - * however, change back to the original value. Therefore - * we can detect whether we acquired the correct lock. - */ - if (unlikely(lock_ptr != q->lock_ptr)) { - spin_unlock(lock_ptr); - goto retry; - } - __unqueue_futex(q); - - BUG_ON(q->pi_state); - - spin_unlock(lock_ptr); - ret = 1; - } - - return ret; -} - -/* - * PI futexes can not be requeued and must remove themselves from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. 
- */ -static void unqueue_me_pi(struct futex_q *q) -{ - __unqueue_futex(q); - - BUG_ON(!q->pi_state); - put_pi_state(q->pi_state); - q->pi_state = NULL; -} - -static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *argowner) -{ - struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner, *newowner; - u32 uval, curval, newval, newtid; - int err = 0; - - oldowner = pi_state->owner; - - /* - * We are here because either: - * - * - we stole the lock and pi_state->owner needs updating to reflect - * that (@argowner == current), - * - * or: - * - * - someone stole our lock and we need to fix things to point to the - * new owner (@argowner == NULL). - * - * Either way, we have to replace the TID in the user space variable. - * This must be atomic as we have to preserve the owner died bit here. - * - * Note: We write the user space value _before_ changing the pi_state - * because we can fault here. Imagine swapped out pages or a fork - * that marked all the anonymous memory readonly for cow. - * - * Modifying pi_state _before_ the user space value would leave the - * pi_state in an inconsistent state when we fault here, because we - * need to drop the locks to handle the fault. This might be observed - * in the PID checks when attaching to PI state . - */ -retry: - if (!argowner) { - if (oldowner != current) { - /* - * We raced against a concurrent self; things are - * already fixed up. Nothing to do. - */ - return 0; - } - - if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { - /* We got the lock. pi_state is correct. Tell caller. */ - return 1; - } - - /* - * The trylock just failed, so either there is an owner or - * there is a higher priority waiter than this one. - */ - newowner = rt_mutex_owner(&pi_state->pi_mutex); - /* - * If the higher priority waiter has not yet taken over the - * rtmutex then newowner is NULL. We can't return here with - * that state because it's inconsistent vs. the user space - * state. So drop the locks and try again. It's a valid - * situation and not any different from the other retry - * conditions. - */ - if (unlikely(!newowner)) { - err = -EAGAIN; - goto handle_err; - } - } else { - WARN_ON_ONCE(argowner != current); - if (oldowner == current) { - /* - * We raced against a concurrent self; things are - * already fixed up. Nothing to do. - */ - return 1; - } - newowner = argowner; - } - - newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; - - err = get_futex_value_locked(&uval, uaddr); - if (err) - goto handle_err; - - for (;;) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - - err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); - if (err) - goto handle_err; - - if (curval == uval) - break; - uval = curval; - } - - /* - * We fixed up user space. Now we need to fix the pi_state - * itself. - */ - pi_state_update_owner(pi_state, newowner); - - return argowner == current; - - /* - * In order to reschedule or handle a page fault, we need to drop the - * locks here. In the case of a fault, this gives the other task - * (either the highest priority waiter itself or the task which stole - * the rtmutex) the chance to try the fixup of the pi_state. So once we - * are back from handling the fault we need to check the pi_state after - * reacquiring the locks and before trying to do another fixup. When - * the fixup has been done already we simply return. - * - * Note: we hold both hb->lock and pi_mutex->wait_lock. 
We can safely - * drop hb->lock since the caller owns the hb -> futex_q relation. - * Dropping the pi_mutex->wait_lock requires the state revalidate. - */ -handle_err: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(q->lock_ptr); - - switch (err) { - case -EFAULT: - err = fault_in_user_writeable(uaddr); - break; - - case -EAGAIN: - cond_resched(); - err = 0; - break; - - default: - WARN_ON_ONCE(1); - break; - } - - spin_lock(q->lock_ptr); - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - - /* - * Check if someone else fixed it for us: - */ - if (pi_state->owner != oldowner) - return argowner == current; - - /* Retry if err was -EAGAIN or the fault in succeeded */ - if (!err) - goto retry; - - /* - * fault_in_user_writeable() failed so user state is immutable. At - * best we can make the kernel state consistent but user state will - * be most likely hosed and any subsequent unlock operation will be - * rejected due to PI futex rule [10]. - * - * Ensure that the rtmutex owner is also the pi_state owner despite - * the user space value claiming something different. There is no - * point in unlocking the rtmutex if current is the owner as it - * would need to wait until the next waiter has taken the rtmutex - * to guarantee consistent state. Keep it simple. Userspace asked - * for this wreckaged state. - * - * The rtmutex has an owner - either current or some other - * task. See the EAGAIN loop above. - */ - pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); - - return err; -} - -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *argowner) -{ - struct futex_pi_state *pi_state = q->pi_state; - int ret; - - lockdep_assert_held(q->lock_ptr); - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - ret = __fixup_pi_state_owner(uaddr, q, argowner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; -} - -static long futex_wait_restart(struct restart_block *restart); - -/** - * fixup_owner() - Post lock pi_state and corner case management - * @uaddr: user address of the futex - * @q: futex_q (contains pi_state and access to the rt_mutex) - * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) - * - * After attempting to lock an rt_mutex, this function is called to cleanup - * the pi_state owner as well as handle race conditions that may allow us to - * acquire the lock. Must be called with the hb lock held. - * - * Return: - * - 1 - success, lock taken; - * - 0 - success, lock not taken; - * - <0 - on error (-EFAULT) - */ -static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) -{ - if (locked) { - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case: - * - * Speculative pi_state->owner read (we don't hold wait_lock); - * since we own the lock pi_state->owner == current is the - * stable state, anything else needs more attention. - */ - if (q->pi_state->owner != current) - return fixup_pi_state_owner(uaddr, q, current); - return 1; - } - - /* - * If we didn't get the lock; check if anybody stole it from us. In - * that case, we need to fix up the uval to point to them instead of - * us, otherwise bad things happen. [10] - * - * Another speculative read; pi_state->owner == current is unstable - * but needs our attention. - */ - if (q->pi_state->owner == current) - return fixup_pi_state_owner(uaddr, q, NULL); - - /* - * Paranoia check. 
If we did not take the lock, then we should not be - * the owner of the rt_mutex. Warn and establish consistent state. - */ - if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) - return fixup_pi_state_owner(uaddr, q, current); - - return 0; -} - -/** - * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal - * @hb: the futex hash bucket, must be locked by the caller - * @q: the futex_q to queue up on - * @timeout: the prepared hrtimer_sleeper, or null for no timeout - */ -static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) -{ - /* - * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using smp_store_mb() and - * queue_me() calls spin_unlock() upon completion, both serializing - * access to the hash list and forcing another memory barrier. - */ - set_current_state(TASK_INTERRUPTIBLE); - queue_me(q, hb); - - /* Arm the timer */ - if (timeout) - hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); - - /* - * If we have been removed from the hash list, then another task - * has tried to wake us, and we can skip the call to schedule(). - */ - if (likely(!plist_node_empty(&q->list))) { - /* - * If the timer has already expired, current will already be - * flagged for rescheduling. Only call schedule if there - * is no timeout, or if it has yet to expire. - */ - if (!timeout || timeout->task) - freezable_schedule(); - } - __set_current_state(TASK_RUNNING); -} - -/** - * futex_wait_setup() - Prepare to wait on a futex - * @uaddr: the futex userspace address - * @val: the expected value - * @flags: futex flags (FLAGS_SHARED, etc.) - * @q: the associated futex_q - * @hb: storage for hash_bucket pointer to be returned to caller - * - * Setup the futex_q and locate the hash_bucket. Get the futex value and - * compare it with the expected value. Handle atomic faults internally. - * Return with the hb lock held on success, and unlocked on failure. - * - * Return: - * - 0 - uaddr contains val and hb has been locked; - * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked - */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) -{ - u32 uval; - int ret; - - /* - * Access the page AFTER the hash-bucket is locked. - * Order is important: - * - * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); - * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } - * - * The basic logical guarantee of a futex is that it blocks ONLY - * if cond(var) is known to be true at the time of blocking, for - * any cond. If we locked the hash-bucket after testing *uaddr, that - * would open a race condition where we could block indefinitely with - * cond(var) false, which would violate the guarantee. - * - * On the other hand, we insert q and release the hash-bucket only - * after testing *uaddr. This guarantees that futex_wait() will NOT - * absorb a wakeup if *uaddr does not match the desired values - * while the syscall executes. 
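Spelling the comment's waiter/waker pseudo-code out as a compilable user-space sketch (the function names are invented and error handling is omitted):

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_uint flag;        /* 0 = not ready, 1 = ready */

static void waiter(void)
{
        uint32_t val;

        /* val = var; if (cond(val)) futex_wait(&var, val); */
        while ((val = atomic_load(&flag)) == 0)
                syscall(SYS_futex, &flag, FUTEX_WAIT_PRIVATE,
                        val, NULL, NULL, 0);
}

static void waker(void)
{
        /* var = new; futex_wake(&var); */
        atomic_store(&flag, 1);
        syscall(SYS_futex, &flag, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
}

If the store in waker() lands before the waiter blocks, FUTEX_WAIT returns -EAGAIN because the value no longer matches, which is the user-visible side of the ordering guarantee described above.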
- */ -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); - if (unlikely(ret != 0)) - return ret; - -retry_private: - *hb = queue_lock(q); - - ret = get_futex_value_locked(&uval, uaddr); - - if (ret) { - queue_unlock(*hb); - - ret = get_user(uval, uaddr); - if (ret) - return ret; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; - } - - if (uval != val) { - queue_unlock(*hb); - ret = -EWOULDBLOCK; - } - - return ret; -} - -static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) -{ - struct hrtimer_sleeper timeout, *to; - struct restart_block *restart; - struct futex_hash_bucket *hb; - struct futex_q q = futex_q_init; - int ret; - - if (!bitset) - return -EINVAL; - q.bitset = bitset; - - to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); -retry: - /* - * Prepare to wait on uaddr. On success, it holds hb->lock and q - * is initialized. - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) - goto out; - - /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to); - - /* If we were woken (and unqueued), we succeeded, whatever. */ - ret = 0; - if (!unqueue_me(&q)) - goto out; - ret = -ETIMEDOUT; - if (to && !to->task) - goto out; - - /* - * We expect signal_pending(current), but we might be the - * victim of a spurious wakeup as well. - */ - if (!signal_pending(current)) - goto retry; - - ret = -ERESTARTSYS; - if (!abs_time) - goto out; - - restart = ¤t->restart_block; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = *abs_time; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; - - ret = set_restart_fn(restart, futex_wait_restart); - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret; -} - - -static long futex_wait_restart(struct restart_block *restart) -{ - u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; - - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } - restart->fn = do_no_restart_syscall; - - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); -} - - -/* - * Userspace tried a 0 -> TID atomic transition of the futex value - * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block as a consequence of relying - * on rt-mutexes, it does PI, etc. (Due to races the kernel might see - * a 0 value of the futex too.). - * - * Also serves as futex trylock_pi()'ing, and due semantics. - */ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, - ktime_t *time, int trylock) -{ - struct hrtimer_sleeper timeout, *to; - struct task_struct *exiting = NULL; - struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; - struct futex_q q = futex_q_init; - int res, ret; - - if (!IS_ENABLED(CONFIG_FUTEX_PI)) - return -ENOSYS; - - if (refill_pi_state_cache()) - return -ENOMEM; - - to = futex_setup_timer(time, &timeout, flags, 0); - -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); - if (unlikely(ret != 0)) - goto out; - -retry_private: - hb = queue_lock(&q); - - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, - &exiting, 0); - if (unlikely(ret)) { - /* - * Atomic work succeeded and we got the lock, - * or failed. Either way, we do _not_ block. 
- */ - switch (ret) { - case 1: - /* We got the lock. */ - ret = 0; - goto out_unlock_put_key; - case -EFAULT: - goto uaddr_faulted; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Task is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - queue_unlock(hb); - /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. - */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock_put_key; - } - } - - WARN_ON(!q.pi_state); - - /* - * Only actually queue now that the atomic ops are done: - */ - __queue_me(&q, hb); - - if (trylock) { - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 0 : -EWOULDBLOCK; - goto no_block; - } - - rt_mutex_init_waiter(&rt_waiter); - - /* - * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not - * hold it while doing rt_mutex_start_proxy(), because then it will - * include hb->lock in the blocking chain, even through we'll not in - * fact hold it while blocking. This will lead it to report -EDEADLK - * and BUG when futex_unlock_pi() interleaves with this. - * - * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling __rt_mutex_start_proxy_lock(). This - * interleaves with futex_unlock_pi() -- which does a similar lock - * handoff -- such that the latter can observe the futex_q::pi_state - * before __rt_mutex_start_proxy_lock() is done. - */ - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - spin_unlock(q.lock_ptr); - /* - * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter - * such that futex_unlock_pi() is guaranteed to observe the waiter when - * it sees the futex_q::pi_state. - */ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); - raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); - - if (ret) { - if (ret == 1) - ret = 0; - goto cleanup; - } - - if (unlikely(to)) - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); - - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); - -cleanup: - spin_lock(q.lock_ptr); - /* - * If we failed to acquire the lock (deadlock/signal/timeout), we must - * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait - * lists consistent. - * - * In particular; it is important that futex_unlock_pi() can not - * observe this inconsistency. - */ - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) - ret = 0; - -no_block: - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_owner(uaddr, &q, !ret); - /* - * If fixup_owner() returned an error, propagate that. If it acquired - * the lock, clear our -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - unqueue_me_pi(&q); - spin_unlock(q.lock_ptr); - goto out; - -out_unlock_put_key: - queue_unlock(hb); - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret != -EINTR ? ret : -ERESTARTNOINTR; - -uaddr_faulted: - queue_unlock(hb); - - ret = fault_in_user_writeable(uaddr); - if (ret) - goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; -} - -/* - * Userspace attempted a TID -> 0 atomic transition, and failed. 
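Both user-space fast paths mentioned here and above futex_lock_pi() can be sketched as follows (illustrative only; the helper names are invented and real code must additionally cope with FUTEX_OWNER_DIED): an uncontended lock is a 0 -> TID cmpxchg, an uncontended unlock is a TID -> 0 cmpxchg, and only the contended cases enter the kernel.

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static inline uint32_t self_tid(void)
{
        return (uint32_t)syscall(SYS_gettid);
}

static void pi_lock(_Atomic uint32_t *futex)
{
        uint32_t zero = 0;

        /* Fast path: 0 -> TID. */
        if (atomic_compare_exchange_strong(futex, &zero, self_tid()))
                return;
        /* Contended: queue on the rt_mutex, boosting the owner if needed. */
        syscall(SYS_futex, futex, FUTEX_LOCK_PI_PRIVATE, 0, NULL, NULL, 0);
}

static void pi_unlock(_Atomic uint32_t *futex)
{
        uint32_t tid = self_tid();

        /* Fast path: TID -> 0, only valid while no waiters are queued. */
        if (atomic_compare_exchange_strong(futex, &tid, 0))
                return;
        /* FUTEX_WAITERS is set: let the kernel hand the lock to the top waiter. */
        syscall(SYS_futex, futex, FUTEX_UNLOCK_PI_PRIVATE, 0, NULL, NULL, 0);
}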
- * This is the in-kernel slowpath: we look up the PI state (if any), - * and do the rt-mutex unlock. - */ -static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) -{ - u32 curval, uval, vpid = task_pid_vnr(current); - union futex_key key = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb; - struct futex_q *top_waiter; - int ret; - - if (!IS_ENABLED(CONFIG_FUTEX_PI)) - return -ENOSYS; - -retry: - if (get_user(uval, uaddr)) - return -EFAULT; - /* - * We release only a lock we actually own: - */ - if ((uval & FUTEX_TID_MASK) != vpid) - return -EPERM; - - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); - if (ret) - return ret; - - hb = hash_futex(&key); - spin_lock(&hb->lock); - - /* - * Check waiters first. We do not trust user space values at - * all and we at least want to know if user space fiddled - * with the futex value instead of blindly unlocking. - */ - top_waiter = futex_top_waiter(hb, &key); - if (top_waiter) { - struct futex_pi_state *pi_state = top_waiter->pi_state; - - ret = -EINVAL; - if (!pi_state) - goto out_unlock; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - goto out_unlock; - - get_pi_state(pi_state); - /* - * By taking wait_lock while still holding hb->lock, we ensure - * there is no point where we hold neither; and therefore - * wake_futex_pi() must observe a state consistent with what we - * observed. - * - * In particular; this forces __rt_mutex_start_proxy() to - * complete such that we're guaranteed to observe the - * rt_waiter. Also see the WARN in wake_futex_pi(). - */ - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); - - /* drops pi_state->pi_mutex.wait_lock */ - ret = wake_futex_pi(uaddr, uval, pi_state); - - put_pi_state(pi_state); - - /* - * Success, we're done! No tricky corner cases. - */ - if (!ret) - return ret; - /* - * The atomic access to the futex value generated a - * pagefault, so retry the user-access and the wakeup: - */ - if (ret == -EFAULT) - goto pi_faulted; - /* - * A unconditional UNLOCK_PI op raced against a waiter - * setting the FUTEX_WAITERS bit. Try again. - */ - if (ret == -EAGAIN) - goto pi_retry; - /* - * wake_futex_pi has detected invalid state. Tell user - * space. - */ - return ret; - } - - /* - * We have no kernel internal state, i.e. no waiters in the - * kernel. Waiters which are about to queue themselves are stuck - * on hb->lock. So we can safely ignore them. We do neither - * preserve the WAITERS bit not the OWNER_DIED one. We are the - * owner. - */ - if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { - spin_unlock(&hb->lock); - switch (ret) { - case -EFAULT: - goto pi_faulted; - - case -EAGAIN: - goto pi_retry; - - default: - WARN_ON_ONCE(1); - return ret; - } - } - - /* - * If uval has changed, let user space handle it. - */ - ret = (curval == uval) ? 0 : -EAGAIN; - -out_unlock: - spin_unlock(&hb->lock); - return ret; - -pi_retry: - cond_resched(); - goto retry; - -pi_faulted: - - ret = fault_in_user_writeable(uaddr); - if (!ret) - goto retry; - - return ret; -} - -/** - * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex - * @hb: the hash_bucket futex_q was original enqueued on - * @q: the futex_q woken while waiting to be requeued - * @timeout: the timeout associated with the wait (NULL if none) - * - * Determine the cause for the early wakeup. 
- * - * Return: - * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR - */ -static inline -int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, - struct hrtimer_sleeper *timeout) -{ - int ret; - - /* - * With the hb lock held, we avoid races while we process the wakeup. - * We only need to hold hb (and not hb2) to ensure atomicity as the - * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. - * It can't be requeued from uaddr2 to something else since we don't - * support a PI aware source futex for requeue. - */ - WARN_ON_ONCE(&hb->lock != q->lock_ptr); - - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); - - /* Handle spurious wakeups gracefully */ - ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; - return ret; -} - -/** - * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initially wait on (non-pi) - * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. - * @val: the expected value of uaddr - * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @uaddr2: the pi futex we will take prior to returning to user-space - * - * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake - * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to - * userspace. This ensures the rt_mutex maintains an owner when it has waiters; - * without one, the pi logic would not know which task to boost/deboost, if - * there was a need to. - * - * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following-- - * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue - * 3) signal - * 4) timeout - * - * If 3, cleanup and return -ERESTARTNOINTR. - * - * If 2, we may then block on trying to take the rt_mutex and return via: - * 5) successful lock - * 6) signal - * 7) timeout - * 8) other lock acquisition failure - * - * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). - * - * If 4 or 7, we cleanup and return with -ETIMEDOUT. - * - * Return: - * - 0 - On success; - * - <0 - On error - */ -static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - u32 val, ktime_t *abs_time, u32 bitset, - u32 __user *uaddr2) -{ - struct hrtimer_sleeper timeout, *to; - struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; - union futex_key key2 = FUTEX_KEY_INIT; - struct futex_q q = futex_q_init; - struct rt_mutex_base *pi_mutex; - int res, ret; - - if (!IS_ENABLED(CONFIG_FUTEX_PI)) - return -ENOSYS; - - if (uaddr == uaddr2) - return -EINVAL; - - if (!bitset) - return -EINVAL; - - to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); - - /* - * The waiter is allocated on our stack, manipulated by the requeue - * code while we sleep on uaddr. - */ - rt_mutex_init_waiter(&rt_waiter); - - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); - if (unlikely(ret != 0)) - goto out; - - q.bitset = bitset; - q.rt_waiter = &rt_waiter; - q.requeue_pi_key = &key2; - - /* - * Prepare to wait on uaddr. 
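For orientation before the wait itself, the two halves of the requeue-PI protocol pair up in user space roughly like this (a sketch with invented names; glibc's PI-aware condition variables layer a lot of extra state on top of these two calls):

#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Waiter: block on the non-PI word 'cond', naming the PI futex 'mutex'
 * it expects to be requeued onto. On a 0 return the kernel has already
 * acquired 'mutex' on the caller's behalf. */
static long cond_wait_pi(uint32_t *cond, uint32_t *mutex, uint32_t seen)
{
        return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI_PRIVATE,
                       seen, NULL, mutex, 0);
}

/* Broadcaster: wake at most one waiter (nr_wake must be 1) and requeue
 * the remainder onto 'mutex'. */
static long cond_broadcast_pi(uint32_t *cond, uint32_t *mutex, uint32_t seen)
{
        return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI_PRIVATE,
                       1, (unsigned long)INT_MAX, mutex, seen);
}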
On success, it holds hb->lock and q - * is initialized. - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) - goto out; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (match_futex(&q.key, &key2)) { - queue_unlock(hb); - ret = -EINVAL; - goto out; - } - - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to); - - switch (futex_requeue_pi_wakeup_sync(&q)) { - case Q_REQUEUE_PI_IGNORE: - /* The waiter is still on uaddr1 */ - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, to); - spin_unlock(&hb->lock); - break; - - case Q_REQUEUE_PI_LOCKED: - /* The requeue acquired the lock */ - if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); - ret = fixup_owner(uaddr2, &q, true); - /* - * Drop the reference to the pi state which the - * requeue_pi() code acquired for us. - */ - put_pi_state(q.pi_state); - spin_unlock(q.lock_ptr); - /* - * Adjust the return value. It's either -EFAULT or - * success (1) but the caller expects 0 for success. - */ - ret = ret < 0 ? ret : 0; - } - break; - - case Q_REQUEUE_PI_DONE: - /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ - pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - - /* Current is not longer pi_blocked_on */ - spin_lock(q.lock_ptr); - if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) - ret = 0; - - debug_rt_mutex_free_waiter(&rt_waiter); - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_owner(uaddr2, &q, !ret); - /* - * If fixup_owner() returned an error, propagate that. If it - * acquired the lock, clear -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - unqueue_me_pi(&q); - spin_unlock(q.lock_ptr); - - if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart - * by calling futex_lock_pi() directly. We could - * restart this syscall, but it would detect that - * the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart - * and return -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; - } - break; - default: - BUG(); - } - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret; -} - -/* - * Support for robust futexes: the kernel cleans up held futexes at - * thread exit time. - * - * Implementation: user-space maintains a per-thread list of locks it - * is holding. Upon do_exit(), the kernel carefully walks this list, - * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is - * always manipulated with the lock held, so the list is private and - * per-thread. Userspace also maintains a per-thread 'list_op_pending' - * field, to allow the kernel to clean up if the thread dies after - * acquiring the lock, but just before it could have added itself to - * the list. There can only be one such pending lock. 
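A minimal user-space sketch of the registration just described (the lock structure and helper names are invented, error handling is omitted, and glibc already registers its own robust list for pthread robust mutexes, so real programs rarely do this by hand). The kernel only reads the list at exit time, so all bookkeeping, including list_op_pending, is ordinary stores by the owning thread:

#include <linux/futex.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

struct robust_lock {                    /* hypothetical lock object */
        struct robust_list node;        /* linked into the per-thread list */
        uint32_t futex;                 /* 0 when free, owner TID when held */
};

static _Thread_local struct robust_list_head rlist;

static void robust_init(void)
{
        rlist.list.next = &rlist.list;  /* empty list points back at itself */
        rlist.futex_offset = offsetof(struct robust_lock, futex) -
                             offsetof(struct robust_lock, node);
        rlist.list_op_pending = NULL;
        /* One registration per thread; len must be sizeof(rlist). */
        syscall(SYS_set_robust_list, &rlist, sizeof(rlist));
}

static void robust_lock_record(struct robust_lock *l)
{
        /* Announce the pending acquisition first ... */
        rlist.list_op_pending = &l->node;
        /* ... the 0 -> TID cmpxchg of l->futex happens here ... */
        /* ... then link the lock and clear the pending pointer.
         * (For a PI lock, bit 0 of the node pointer would be set.) */
        l->node.next = rlist.list.next;
        rlist.list.next = &l->node;
        rlist.list_op_pending = NULL;
}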
- */ - -/** - * sys_set_robust_list() - Set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects - */ -SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, - size_t, len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - /* - * The kernel knows only one size for now: - */ - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->robust_list = head; - - return 0; -} - -/** - * sys_get_robust_list() - Get the robust-futex list head of a task - * @pid: pid of the process [zero for current task] - * @head_ptr: pointer to a list-head pointer, the kernel fills it in - * @len_ptr: pointer to a length field, the kernel fills in the header size - */ -SYSCALL_DEFINE3(get_robust_list, int, pid, - struct robust_list_head __user * __user *, head_ptr, - size_t __user *, len_ptr) -{ - struct robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; - - head = p->robust_list; - rcu_read_unlock(); - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(head, head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; -} - -/* Constants for the pending_op argument of handle_futex_death */ -#define HANDLE_DEATH_PENDING true -#define HANDLE_DEATH_LIST false - -/* - * Process a futex-list entry, check whether it's owned by the - * dying task, and do notification if so: - */ -static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, - bool pi, bool pending_op) -{ - u32 uval, nval, mval; - int err; - - /* Futex address must be 32bit aligned */ - if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) - return -1; - -retry: - if (get_user(uval, uaddr)) - return -1; - - /* - * Special case for regular (non PI) futexes. The unlock path in - * user space has two race scenarios: - * - * 1. The unlock path releases the user space futex value and - * before it can execute the futex() syscall to wake up - * waiters it is killed. - * - * 2. A woken up waiter is killed before it can acquire the - * futex in user space. - * - * In both cases the TID validation below prevents a wakeup of - * potential waiters which can cause these waiters to block - * forever. - * - * In both cases the following conditions are met: - * - * 1) task->robust_list->list_op_pending != NULL - * @pending_op == true - * 2) User space futex value == 0 - * 3) Regular futex: @pi == false - * - * If these conditions are met, it is safe to attempt waking up a - * potential waiter without touching the user space futex value and - * trying to set the OWNER_DIED bit. The user space futex value is - * uncontended and the rest of the user space mutex state is - * consistent, so a woken waiter will just take over the - * uncontended futex. Setting the OWNER_DIED bit would create - * inconsistent state and malfunction of the user space owner died - * handling. - */ - if (pending_op && !pi && !uval) { - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); - return 0; - } - - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) - return 0; - - /* - * Ok, this dying thread is truly holding a futex - * of interest. 
Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { - switch (err) { - case -EFAULT: - if (fault_in_user_writeable(uaddr)) - return -1; - goto retry; - - case -EAGAIN: - cond_resched(); - goto retry; - - default: - WARN_ON_ONCE(1); - return err; - } - } - - if (nval != uval) - goto retry; - - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); - - return 0; -} - -/* - * Fetch a robust-list pointer. Bit 0 signals PI futexes: - */ -static inline int fetch_robust_entry(struct robust_list __user **entry, - struct robust_list __user * __user *head, - unsigned int *pi) -{ - unsigned long uentry; - - if (get_user(uentry, (unsigned long __user *)head)) - return -EFAULT; - - *entry = (void __user *)(uentry & ~1UL); - *pi = uentry & 1; - - return 0; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. 
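From a waiter's point of view, the net effect of handle_futex_death() is that the futex word loses the dead owner's TID, gains FUTEX_OWNER_DIED (with FUTEX_WAITERS preserved), and one waiter is woken. A hedged sketch of what that waiter might do for a robust, non-PI lock (the helper name is invented; returning EOWNERDEAD mirrors the pthread robust-mutex convention):

#include <errno.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns 0 on a clean acquisition, EOWNERDEAD if the previous owner died. */
static int robust_lock_take(_Atomic uint32_t *futex, uint32_t tid)
{
        for (;;) {
                uint32_t val = atomic_load(futex);

                if ((val & FUTEX_TID_MASK) == 0) {
                        /* Free, possibly because the owner died: claim it and
                         * keep OWNER_DIED so the caller knows to recover. */
                        uint32_t newval = tid |
                                (val & (FUTEX_OWNER_DIED | FUTEX_WAITERS));

                        if (atomic_compare_exchange_strong(futex, &val, newval))
                                return (val & FUTEX_OWNER_DIED) ? EOWNERDEAD : 0;
                        continue;
                }

                /* Held: make sure FUTEX_WAITERS is set, then sleep. */
                if (!(val & FUTEX_WAITERS) &&
                    !atomic_compare_exchange_strong(futex, &val,
                                                    val | FUTEX_WAITERS))
                        continue;
                syscall(SYS_futex, futex, FUTEX_WAIT_PRIVATE,
                        val | FUTEX_WAITERS, NULL, NULL, 0);
        }
}

The matching unlock would unlink the entry from the robust list, store 0 to the word, and issue FUTEX_WAKE when FUTEX_WAITERS was set.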
- */ -static void exit_robust_list(struct task_struct *curr) -{ - struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int next_pi; - unsigned long futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * don't process it twice: - */ - if (entry != pending) { - if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi, HANDLE_DEATH_LIST)) - return; - } - if (rc) - return; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - - if (pending) { - handle_futex_death((void __user *)pending + futex_offset, - curr, pip, HANDLE_DEATH_PENDING); - } -} - -static void futex_cleanup(struct task_struct *tsk) -{ - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } - -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } -#endif - - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); -} - -/** - * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD - * @tsk: task to set the state on - * - * Set the futex exit state of the task lockless. The futex waiter code - * observes that state when a task is exiting and loops until the task has - * actually finished the futex cleanup. The worst case for this is that the - * waiter runs through the wait loop until the state becomes visible. - * - * This is called from the recursive fault handling path in do_exit(). - * - * This is best effort. Either the futex exit code has run already or - * not. If the OWNER_DIED bit has been set on the futex then the waiter can - * take it over. If not, the problem is pushed back to user space. If the - * futex exit code did not run yet, then an already queued waiter might - * block forever, but there is nothing which can be done about that. - */ -void futex_exit_recursive(struct task_struct *tsk) -{ - /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ - if (tsk->futex_state == FUTEX_STATE_EXITING) - mutex_unlock(&tsk->futex_exit_mutex); - tsk->futex_state = FUTEX_STATE_DEAD; -} - -static void futex_cleanup_begin(struct task_struct *tsk) -{ - /* - * Prevent various race issues against a concurrent incoming waiter - * including live locks by forcing the waiter to block on - * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in - * attach_to_pi_owner(). - */ - mutex_lock(&tsk->futex_exit_mutex); - - /* - * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. 
- * - * This ensures that all subsequent checks of tsk->futex_state in - * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with - * tsk->pi_lock held. - * - * It guarantees also that a pi_state which was queued right before - * the state change under tsk->pi_lock by a concurrent waiter must - * be observed in exit_pi_state_list(). - */ - raw_spin_lock_irq(&tsk->pi_lock); - tsk->futex_state = FUTEX_STATE_EXITING; - raw_spin_unlock_irq(&tsk->pi_lock); -} - -static void futex_cleanup_end(struct task_struct *tsk, int state) -{ - /* - * Lockless store. The only side effect is that an observer might - * take another loop until it becomes visible. - */ - tsk->futex_state = state; - /* - * Drop the exit protection. This unblocks waiters which observed - * FUTEX_STATE_EXITING to reevaluate the state. - */ - mutex_unlock(&tsk->futex_exit_mutex); -} - -void futex_exec_release(struct task_struct *tsk) -{ - /* - * The state handling is done for consistency, but in the case of - * exec() there is no way to prevent further damage as the PID stays - * the same. But for the unlikely and arguably buggy case that a - * futex is held on exec(), this provides at least as much state - * consistency protection which is possible. - */ - futex_cleanup_begin(tsk); - futex_cleanup(tsk); - /* - * Reset the state to FUTEX_STATE_OK. The task is alive and about - * exec a new binary. - */ - futex_cleanup_end(tsk, FUTEX_STATE_OK); -} - -void futex_exit_release(struct task_struct *tsk) -{ - futex_cleanup_begin(tsk); - futex_cleanup(tsk); - futex_cleanup_end(tsk, FUTEX_STATE_DEAD); -} - -long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - u32 __user *uaddr2, u32 val2, u32 val3) -{ - int cmd = op & FUTEX_CMD_MASK; - unsigned int flags = 0; - - if (!(op & FUTEX_PRIVATE_FLAG)) - flags |= FLAGS_SHARED; - - if (op & FUTEX_CLOCK_REALTIME) { - flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && - cmd != FUTEX_LOCK_PI2) - return -ENOSYS; - } - - switch (cmd) { - case FUTEX_LOCK_PI: - case FUTEX_LOCK_PI2: - case FUTEX_UNLOCK_PI: - case FUTEX_TRYLOCK_PI: - case FUTEX_WAIT_REQUEUE_PI: - case FUTEX_CMP_REQUEUE_PI: - if (!futex_cmpxchg_enabled) - return -ENOSYS; - } - - switch (cmd) { - case FUTEX_WAIT: - val3 = FUTEX_BITSET_MATCH_ANY; - fallthrough; - case FUTEX_WAIT_BITSET: - return futex_wait(uaddr, flags, val, timeout, val3); - case FUTEX_WAKE: - val3 = FUTEX_BITSET_MATCH_ANY; - fallthrough; - case FUTEX_WAKE_BITSET: - return futex_wake(uaddr, flags, val, val3); - case FUTEX_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); - case FUTEX_CMP_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); - case FUTEX_WAKE_OP: - return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); - case FUTEX_LOCK_PI: - flags |= FLAGS_CLOCKRT; - fallthrough; - case FUTEX_LOCK_PI2: - return futex_lock_pi(uaddr, flags, timeout, 0); - case FUTEX_UNLOCK_PI: - return futex_unlock_pi(uaddr, flags); - case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, NULL, 1); - case FUTEX_WAIT_REQUEUE_PI: - val3 = FUTEX_BITSET_MATCH_ANY; - return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, - uaddr2); - case FUTEX_CMP_REQUEUE_PI: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); - } - return -ENOSYS; -} - -static __always_inline bool futex_cmd_has_timeout(u32 cmd) -{ - switch (cmd) { - case FUTEX_WAIT: - case FUTEX_LOCK_PI: - case FUTEX_LOCK_PI2: - case FUTEX_WAIT_BITSET: - case FUTEX_WAIT_REQUEUE_PI: - return true; - } - return 
false; -} - -static __always_inline int -futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) -{ - if (!timespec64_valid(ts)) - return -EINVAL; - - *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) - *t = ktime_add_safe(ktime_get(), *t); - else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) - *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); - return 0; -} - -SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - const struct __kernel_timespec __user *, utime, - u32 __user *, uaddr2, u32, val3) -{ - int ret, cmd = op & FUTEX_CMD_MASK; - ktime_t t, *tp = NULL; - struct timespec64 ts; - - if (utime && futex_cmd_has_timeout(cmd)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) - return -EFAULT; - ret = futex_init_timeout(cmd, op, &ts, &t); - if (ret) - return ret; - tp = &t; - } - - return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); -} - -#ifdef CONFIG_COMPAT -/* - * Fetch a robust-list pointer. Bit 0 signals PI futexes: - */ -static inline int -compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, unsigned int *pi) -{ - if (get_user(*uentry, head)) - return -EFAULT; - - *entry = compat_ptr((*uentry) & ~1); - *pi = (unsigned int)(*uentry) & 1; - - return 0; -} - -static void __user *futex_uaddr(struct robust_list __user *entry, - compat_long_t futex_offset) -{ - compat_uptr_t base = ptr_to_compat(entry); - void __user *uaddr = compat_ptr(base + futex_offset); - - return uaddr; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. 
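One user-visible consequence of futex_init_timeout() above deserves an example: FUTEX_WAIT takes utime as a relative timeout (the kernel adds ktime_get()), while FUTEX_WAIT_BITSET takes an absolute expiry, on CLOCK_MONOTONIC by default or CLOCK_REALTIME when FUTEX_CLOCK_REALTIME is set. A minimal sketch (function names invented, errors ignored):

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static void wait_relative_100ms(uint32_t *futex, uint32_t val)
{
        struct timespec rel = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

        /* Relative: give up 100ms from now. */
        syscall(SYS_futex, futex, FUTEX_WAIT_PRIVATE, val, &rel, NULL, 0);
}

static void wait_absolute(uint32_t *futex, uint32_t val, int secs_from_now)
{
        struct timespec abs;

        clock_gettime(CLOCK_MONOTONIC, &abs);
        abs.tv_sec += secs_from_now;    /* absolute CLOCK_MONOTONIC deadline */

        syscall(SYS_futex, futex, FUTEX_WAIT_BITSET_PRIVATE, val, &abs,
                NULL, FUTEX_BITSET_MATCH_ANY);
}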
- */ -static void compat_exit_robust_list(struct task_struct *curr) -{ - struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int next_pi; - compat_uptr_t uentry, next_uentry, upending; - compat_long_t futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (compat_fetch_robust_entry(&upending, &pending, - &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != (struct robust_list __user *) &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = compat_fetch_robust_entry(&next_uentry, &next_entry, - (compat_uptr_t __user *)&entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * dont process it twice: - */ - if (entry != pending) { - void __user *uaddr = futex_uaddr(entry, futex_offset); - - if (handle_futex_death(uaddr, curr, pi, - HANDLE_DEATH_LIST)) - return; - } - if (rc) - return; - uentry = next_uentry; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - if (pending) { - void __user *uaddr = futex_uaddr(pending, futex_offset); - - handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); - } -} - -COMPAT_SYSCALL_DEFINE2(set_robust_list, - struct compat_robust_list_head __user *, head, - compat_size_t, len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->compat_robust_list = head; - - return 0; -} - -COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - compat_uptr_t __user *, head_ptr, - compat_size_t __user *, len_ptr) -{ - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; - - head = p->compat_robust_list; - rcu_read_unlock(); - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; -} -#endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_COMPAT_32BIT_TIME -SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -{ - int ret, cmd = op & FUTEX_CMD_MASK; - ktime_t t, *tp = NULL; - struct timespec64 ts; - - if (utime && futex_cmd_has_timeout(cmd)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - ret = futex_init_timeout(cmd, op, &ts, &t); - if (ret) - return ret; - tp = &t; - } - - return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); -} -#endif /* CONFIG_COMPAT_32BIT_TIME */ - -static void __init futex_detect_cmpxchg(void) -{ -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG - u32 curval; - - /* - * This will fail and we want it. 
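Since futex_cmpxchg_enabled also gates the robust-list and PI syscalls above, user space can probe for it without side effects; a hedged sketch (the helper name is invented, and on the vast majority of architectures the probe simply succeeds):

#include <errno.h>
#include <linux/futex.h>
#include <stdbool.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <unistd.h>

static bool robust_pi_futexes_work(void)
{
        struct robust_list_head *head;
        size_t len;

        /* pid 0 means "current task"; ENOSYS means no atomic futex cmpxchg. */
        if (syscall(SYS_get_robust_list, 0, &head, &len) == 0)
                return true;
        return errno != ENOSYS;
}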
Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; -#endif -} - -static int __init futex_init(void) -{ - unsigned int futex_shift; - unsigned long i; - -#if CONFIG_BASE_SMALL - futex_hashsize = 16; -#else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); -#endif - - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, - futex_hashsize < 256 ? HASH_SMALL : 0, - &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; - - futex_detect_cmpxchg(); - - for (i = 0; i < futex_hashsize; i++) { - atomic_set(&futex_queues[i].waiters, 0); - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); - } - - return 0; -} -core_initcall(futex_init); diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile new file mode 100644 index 000000000000..b77188d1fa07 --- /dev/null +++ b/kernel/futex/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += core.o syscalls.o pi.o requeue.o waitwake.o diff --git a/kernel/futex/core.c b/kernel/futex/core.c new file mode 100644 index 000000000000..25d8a88b32e5 --- /dev/null +++ b/kernel/futex/core.c @@ -0,0 +1,1176 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Fast Userspace Mutexes (which I call "Futexes!"). + * (C) Rusty Russell, IBM 2002 + * + * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar + * (C) Copyright 2003 Red Hat Inc, All Rights Reserved + * + * Removed page pinning, fix privately mapped COW pages and other cleanups + * (C) Copyright 2003, 2004 Jamie Lokier + * + * Robust futex support started by Ingo Molnar + * (C) Copyright 2006 Red Hat Inc, All Rights Reserved + * Thanks to Thomas Gleixner for suggestions, analysis and fixes. + * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> + * + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. + * + * "The futexes are also cursed." + * "But they come in a choice of three flavours!" + */ +#include <linux/compat.h> +#include <linux/jhash.h> +#include <linux/pagemap.h> +#include <linux/memblock.h> +#include <linux/fault-inject.h> +#include <linux/slab.h> + +#include "futex.h" +#include "../locking/rtmutex_common.h" + +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG +int __read_mostly futex_cmpxchg_enabled; +#endif + + +/* + * The base of the bucket array and its size are always used together + * (after initialization only in futex_hash()), so ensure that they + * reside in the same cacheline. 
+ */ +static struct { + struct futex_hash_bucket *queues; + unsigned long hashsize; +} __futex_data __read_mostly __aligned(2*sizeof(long)); +#define futex_queues (__futex_data.queues) +#define futex_hashsize (__futex_data.hashsize) + + +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; + + bool ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = false, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private); + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#endif /* CONFIG_FAIL_FUTEX */ + +/** + * futex_hash - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. + */ +struct futex_hash_bucket *futex_hash(union futex_key *key) +{ + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, + key->both.offset); + + return &futex_queues[hash & (futex_hashsize - 1)]; +} + + +/** + * futex_setup_timer - set up the sleeping hrtimer. + * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + * value given + */ +struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + /* + * If range_ns is 0, calling hrtimer_set_expires_range_ns() is + * effectively the same as calling hrtimer_set_expires(). + */ + hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); + + return timeout; +} + +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that futex_match() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? 
*/ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + +/** + * get_futex_key() - Get parameters which are the keys for a futex + * @uaddr: virtual address of the futex + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: FUTEX_READ, + * FUTEX_WRITE) + * + * Return: a negative error code or 0 + * + * The key words are stored in @key on success. + * + * For shared mappings (when @fshared), the key is: + * + * ( inode->i_sequence, page->index, offset_within_page ) + * + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. + * + * lock_page() might sleep, the caller should not hold a spinlock. + */ +int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) +{ + unsigned long address = (unsigned long)uaddr; + struct mm_struct *mm = current->mm; + struct page *page, *tail; + struct address_space *mapping; + int err, ro = 0; + + /* + * The futex address must be "naturally" aligned. + */ + key->both.offset = address % PAGE_SIZE; + if (unlikely((address % sizeof(u32)) != 0)) + return -EINVAL; + address -= key->both.offset; + + if (unlikely(!access_ok(uaddr, sizeof(u32)))) + return -EFAULT; + + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + + /* + * PROCESS_PRIVATE futexes are fast. + * As the mm cannot disappear under us and the 'key' only needs + * virtual address, we dont even have to find the underlying vma. + * Note : We do have to check 'uaddr' is a valid user address, + * but access_ok() should be faster than find_vma() + */ + if (!fshared) { + key->private.mm = mm; + key->private.address = address; + return 0; + } + +again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); + /* + * If write access is not required (eg. FUTEX_WAIT), try + * and get read-only access. + */ + if (err == -EFAULT && rw == FUTEX_READ) { + err = get_user_pages_fast(address, 1, 0, &page); + ro = 1; + } + if (err < 0) + return err; + else + err = 0; + + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + * + * Mapping checks require the head page for any compound page so the + * head page and mapping is looked up now. For anonymous pages, it + * does not matter if the page splits in the future as the key is + * based on the address. For filesystem-backed pages, the tail is + * required as the index of the page determines the key. 
For + * base pages, there is no tail page and tail == page. + */ + tail = page; + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + + /* + * If page->mapping is NULL, then it cannot be a PageAnon + * page; but it might be the ZERO_PAGE or in the gate area or + * in a special mapping (all cases which we are happy to fail); + * or it may have been a good file page when get_user_pages_fast + * found it, but truncated or holepunched or subjected to + * invalidate_complete_page2 before we got the page lock (also + * cases which we are happy to fail). And we hold a reference, + * so refcount care in invalidate_complete_page's remove_mapping + * prevents drop_caches from setting mapping to NULL beneath us. + * + * The case we do have to guard against is when memory pressure made + * shmem_writepage move it from filecache to swapcache beneath us: + * an unlikely race, but we do need to retry for page->mapping. + */ + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; + unlock_page(page); + put_page(page); + + if (shmem_swizzled) + goto again; + + return -EFAULT; + } + + /* + * Private mappings are handled in a simple way. + * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * + * NOTE: When userspace waits on a MAP_SHARED mapping, even if + * it's a read-only handle, it's expected that futexes attach to + * the object not the particular process. + */ + if (PageAnon(page)) { + /* + * A RO anonymous page will never change and thus doesn't make + * sense for futex operations. + */ + if (unlikely(should_fail_futex(true)) || ro) { + err = -EFAULT; + goto out; + } + + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ + key->private.mm = mm; + key->private.address = address; + + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.i_seq = get_inode_sequence_number(inode); + key->shared.pgoff = page_to_pgoff(tail); + rcu_read_unlock(); + } + +out: + put_page(page); + return err; +} + +/** + * fault_in_user_writeable() - Fault in user address and verify RW access + * @uaddr: pointer to faulting user space address + * + * Slow path to fixup the fault we just took in the atomic write + * access to @uaddr. + * + * We have no generic implementation of a non-destructive write to the + * user address. We know that we faulted in the atomic pagefault + * disabled section so we can as well avoid the #PF overhead by + * calling get_user_pages() right away. 
+ */ +int fault_in_user_writeable(u32 __user *uaddr) +{ + struct mm_struct *mm = current->mm; + int ret; + + mmap_read_lock(mm); + ret = fixup_user_fault(mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE, NULL); + mmap_read_unlock(mm); + + return ret < 0 ? ret : 0; +} + +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (futex_match(&this->key, key)) + return this; + } + return NULL; +} + +int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) +{ + int ret; + + pagefault_disable(); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); + pagefault_enable(); + + return ret; +} + +int futex_get_value_locked(u32 *dest, u32 __user *from) +{ + int ret; + + pagefault_disable(); + ret = __get_user(*dest, from); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + +/** + * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status + * @exiting: Pointer to the exiting task + * + * Caller must hold a refcount on @exiting. + */ +void wait_for_owner_exiting(int ret, struct task_struct *exiting) +{ + if (ret != -EBUSY) { + WARN_ON_ONCE(exiting); + return; + } + + if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) + return; + + mutex_lock(&exiting->futex_exit_mutex); + /* + * No point in doing state checking here. If the waiter got here + * while the task was in exec()->exec_futex_release() then it can + * have any FUTEX_STATE_* value when the waiter has acquired the + * mutex. OK, if running, EXITING or DEAD if it reached exit() + * already. Highly unlikely and not a problem. Just one more round + * through the futex maze. + */ + mutex_unlock(&exiting->futex_exit_mutex); + + put_task_struct(exiting); +} + +/** + * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +void __futex_unqueue(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) + return; + lockdep_assert_held(q->lock_ptr); + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); +} + +/* The key must be already stored in q->key. */ +struct futex_hash_bucket *futex_q_lock(struct futex_q *q) + __acquires(&hb->lock) +{ + struct futex_hash_bucket *hb; + + hb = futex_hash(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all futex_q_lock() + * users end up calling futex_queue(). Similarly, for housekeeping, + * decrement the counter at futex_q_unlock() when some error has + * occurred and we don't end up adding the task to the list. 
+ */ + futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ + + q->lock_ptr = &hb->lock; + + spin_lock(&hb->lock); + return hb; +} + +void futex_q_unlock(struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + spin_unlock(&hb->lock); + futex_hb_waiters_dec(hb); +} + +void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +{ + int prio; + + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. + */ + prio = min(current->normal_prio, MAX_RT_PRIO); + + plist_node_init(&q->list, prio); + plist_add(&q->list, &hb->chain); + q->task = current; +} + +/** + * futex_unqueue() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must + * be paired with exactly one earlier call to futex_queue(). + * + * Return: + * - 1 - if the futex_q was still queued (and we removed unqueued it); + * - 0 - if the futex_q was already removed by the waking thread + */ +int futex_unqueue(struct futex_q *q) +{ + spinlock_t *lock_ptr; + int ret = 0; + + /* In the common case we don't take the spinlock, which is nice. */ +retry: + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); + if (lock_ptr != NULL) { + spin_lock(lock_ptr); + /* + * q->lock_ptr can change between reading it and + * spin_lock(), causing us to take the wrong lock. This + * corrects the race condition. + * + * Reasoning goes like this: if we have the wrong lock, + * q->lock_ptr must have changed (maybe several times) + * between reading it and the spin_lock(). It can + * change again after the spin_lock() but only if it was + * already changed before the spin_lock(). It cannot, + * however, change back to the original value. Therefore + * we can detect whether we acquired the correct lock. + */ + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } + __futex_unqueue(q); + + BUG_ON(q->pi_state); + + spin_unlock(lock_ptr); + ret = 1; + } + + return ret; +} + +/* + * PI futexes can not be requeued and must remove themselves from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. + */ +void futex_unqueue_pi(struct futex_q *q) +{ + __futex_unqueue(q); + + BUG_ON(!q->pi_state); + put_pi_state(q->pi_state); + q->pi_state = NULL; +} + +/* Constants for the pending_op argument of handle_futex_death */ +#define HANDLE_DEATH_PENDING true +#define HANDLE_DEATH_LIST false + +/* + * Process a futex-list entry, check whether it's owned by the + * dying task, and do notification if so: + */ +static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, + bool pi, bool pending_op) +{ + u32 uval, nval, mval; + int err; + + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; + +retry: + if (get_user(uval, uaddr)) + return -1; + + /* + * Special case for regular (non PI) futexes. The unlock path in + * user space has two race scenarios: + * + * 1. The unlock path releases the user space futex value and + * before it can execute the futex() syscall to wake up + * waiters it is killed. 
+ * + * 2. A woken up waiter is killed before it can acquire the + * futex in user space. + * + * In both cases the TID validation below prevents a wakeup of + * potential waiters which can cause these waiters to block + * forever. + * + * In both cases the following conditions are met: + * + * 1) task->robust_list->list_op_pending != NULL + * @pending_op == true + * 2) User space futex value == 0 + * 3) Regular futex: @pi == false + * + * If these conditions are met, it is safe to attempt waking up a + * potential waiter without touching the user space futex value and + * trying to set the OWNER_DIED bit. The user space futex value is + * uncontended and the rest of the user space mutex state is + * consistent, so a woken waiter will just take over the + * uncontended futex. Setting the OWNER_DIED bit would create + * inconsistent state and malfunction of the user space owner died + * handling. + */ + if (pending_op && !pi && !uval) { + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; + } + + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + + case -EAGAIN: + cond_resched(); + goto retry; + + default: + WARN_ON_ONCE(1); + return err; + } + } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + + return 0; +} + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user * __user *head, + unsigned int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long __user *)head)) + return -EFAULT; + + *entry = (void __user *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. 
+ */ +static void exit_robust_list(struct task_struct *curr) +{ + struct robust_list_head __user *head = curr->robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int next_pi; + unsigned long futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (fetch_robust_entry(&entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * don't process it twice: + */ + if (entry != pending) { + if (handle_futex_death((void __user *)entry + futex_offset, + curr, pi, HANDLE_DEATH_LIST)) + return; + } + if (rc) + return; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + + if (pending) { + handle_futex_death((void __user *)pending + futex_offset, + curr, pip, HANDLE_DEATH_PENDING); + } +} + +#ifdef CONFIG_COMPAT +static void __user *futex_uaddr(struct robust_list __user *entry, + compat_long_t futex_offset) +{ + compat_uptr_t base = ptr_to_compat(entry); + void __user *uaddr = compat_ptr(base + futex_offset); + + return uaddr; +} + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t __user *head, unsigned int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. 
+ */ +static void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int next_pi; + compat_uptr_t uentry, next_uentry, upending; + compat_long_t futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (compat_fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = compat_fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) { + void __user *uaddr = futex_uaddr(entry, futex_offset); + + if (handle_futex_death(uaddr, curr, pi, + HANDLE_DEATH_LIST)) + return; + } + if (rc) + return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + if (pending) { + void __user *uaddr = futex_uaddr(pending, futex_offset); + + handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); + } +} +#endif + +#ifdef CONFIG_FUTEX_PI + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) + */ +static void exit_pi_state_list(struct task_struct *curr) +{ + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; + union futex_key key = FUTEX_KEY_INIT; + + if (!futex_cmpxchg_enabled) + return; + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselves: + */ + raw_spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + hb = futex_hash(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. 
+ */ + if (!refcount_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } + raw_spin_unlock_irq(&curr->pi_lock); + + spin_lock(&hb->lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + put_pi_state(pi_state); + continue; + } + + WARN_ON(pi_state->owner != curr); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; + + raw_spin_unlock(&curr->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + + raw_spin_lock_irq(&curr->pi_lock); + } + raw_spin_unlock_irq(&curr->pi_lock); +} +#else +static inline void exit_pi_state_list(struct task_struct *curr) { } +#endif + +static void futex_cleanup(struct task_struct *tsk) +{ + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); + tsk->robust_list = NULL; + } + +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) { + compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } +#endif + + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); +} + +/** + * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD + * @tsk: task to set the state on + * + * Set the futex exit state of the task lockless. The futex waiter code + * observes that state when a task is exiting and loops until the task has + * actually finished the futex cleanup. The worst case for this is that the + * waiter runs through the wait loop until the state becomes visible. + * + * This is called from the recursive fault handling path in do_exit(). + * + * This is best effort. Either the futex exit code has run already or + * not. If the OWNER_DIED bit has been set on the futex then the waiter can + * take it over. If not, the problem is pushed back to user space. If the + * futex exit code did not run yet, then an already queued waiter might + * block forever, but there is nothing which can be done about that. + */ +void futex_exit_recursive(struct task_struct *tsk) +{ + /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ + if (tsk->futex_state == FUTEX_STATE_EXITING) + mutex_unlock(&tsk->futex_exit_mutex); + tsk->futex_state = FUTEX_STATE_DEAD; +} + +static void futex_cleanup_begin(struct task_struct *tsk) +{ + /* + * Prevent various race issues against a concurrent incoming waiter + * including live locks by forcing the waiter to block on + * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in + * attach_to_pi_owner(). + */ + mutex_lock(&tsk->futex_exit_mutex); + + /* + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. + * + * This ensures that all subsequent checks of tsk->futex_state in + * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with + * tsk->pi_lock held. + * + * It guarantees also that a pi_state which was queued right before + * the state change under tsk->pi_lock by a concurrent waiter must + * be observed in exit_pi_state_list(). 
+ */ + raw_spin_lock_irq(&tsk->pi_lock); + tsk->futex_state = FUTEX_STATE_EXITING; + raw_spin_unlock_irq(&tsk->pi_lock); +} + +static void futex_cleanup_end(struct task_struct *tsk, int state) +{ + /* + * Lockless store. The only side effect is that an observer might + * take another loop until it becomes visible. + */ + tsk->futex_state = state; + /* + * Drop the exit protection. This unblocks waiters which observed + * FUTEX_STATE_EXITING to reevaluate the state. + */ + mutex_unlock(&tsk->futex_exit_mutex); +} + +void futex_exec_release(struct task_struct *tsk) +{ + /* + * The state handling is done for consistency, but in the case of + * exec() there is no way to prevent further damage as the PID stays + * the same. But for the unlikely and arguably buggy case that a + * futex is held on exec(), this provides at least as much state + * consistency protection which is possible. + */ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + /* + * Reset the state to FUTEX_STATE_OK. The task is alive and about + * exec a new binary. + */ + futex_cleanup_end(tsk, FUTEX_STATE_OK); +} + +void futex_exit_release(struct task_struct *tsk) +{ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); +} + +static void __init futex_detect_cmpxchg(void) +{ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG + u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ + unsigned int futex_shift; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + + futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; + + futex_detect_cmpxchg(); + + for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); + plist_head_init(&futex_queues[i].chain); + spin_lock_init(&futex_queues[i].lock); + } + + return 0; +} +core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h new file mode 100644 index 000000000000..040ae4277cb0 --- /dev/null +++ b/kernel/futex/futex.h @@ -0,0 +1,299 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FUTEX_H +#define _FUTEX_H + +#include <linux/futex.h> +#include <linux/sched/wake_q.h> + +#ifdef CONFIG_PREEMPT_RT +#include <linux/rcuwait.h> +#endif + +#include <asm/futex.h> + +/* + * Futex flags used to encode options to functions and preserve them across + * restarts. + */ +#ifdef CONFIG_MMU +# define FLAGS_SHARED 0x01 +#else +/* + * NOMMU does not have per process address space. Let the compiler optimize + * code away. 
+ */ +# define FLAGS_SHARED 0x00 +#endif +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +#define futex_cmpxchg_enabled 1 +#else +extern int __read_mostly futex_cmpxchg_enabled; +#endif + +#ifdef CONFIG_FAIL_FUTEX +extern bool should_fail_futex(bool fshared); +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif + +/* + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task + * waiting on a futex. + */ +struct futex_hash_bucket { + atomic_t waiters; + spinlock_t lock; + struct plist_head chain; +} ____cacheline_aligned_in_smp; + +/* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex_base pi_mutex; + + struct task_struct *owner; + refcount_t refcount; + + union futex_key key; +} __randomize_layout; + +/** + * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex + * @task: the task waiting on the futex + * @lock_ptr: the hash bucket lock + * @key: the key the futex is hashed on + * @pi_state: optional priority inheritance state + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup + * @requeue_state: State field for futex_requeue_pi() + * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). + * + * A futex_q has a woken state, just like tasks have TASK_RUNNING. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. + * The order of wakeup is always to make the first condition true, then + * the second. + * + * PI futexes are typically woken before they are removed from the hash list via + * the rt_mutex code. See futex_unqueue_pi(). + */ +struct futex_q { + struct plist_node list; + + struct task_struct *task; + spinlock_t *lock_ptr; + union futex_key key; + struct futex_pi_state *pi_state; + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; + atomic_t requeue_state; +#ifdef CONFIG_PREEMPT_RT + struct rcuwait requeue_wait; +#endif +} __randomize_layout; + +extern const struct futex_q futex_q_init; + +enum futex_access { + FUTEX_READ, + FUTEX_WRITE +}; + +extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw); + +extern struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns); + +extern struct futex_hash_bucket *futex_hash(union futex_key *key); + +/** + * futex_match - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * + * Return 1 if two futex_keys are equal, 0 otherwise. 
+ */ +static inline int futex_match(union futex_key *key1, union futex_key *key2) +{ + return (key1 && key2 + && key1->both.word == key2->both.word + && key1->both.ptr == key2->both.ptr + && key1->both.offset == key2->both.offset); +} + +extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb); +extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout); +extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); + +extern int fault_in_user_writeable(u32 __user *uaddr); +extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); +extern int futex_get_value_locked(u32 *dest, u32 __user *from); +extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); + +extern void __futex_unqueue(struct futex_q *q); +extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); +extern int futex_unqueue(struct futex_q *q); + +/** + * futex_queue() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * futex_queue() is typically paired with exactly one call to futex_unqueue(). The + * exceptions involve the PI related operations, which may use futex_unqueue_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __futex_queue(q, hb); + spin_unlock(&hb->lock); +} + +extern void futex_unqueue_pi(struct futex_q *q); + +extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); + +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_inc(&hb->waiters); + /* + * Full barrier (A), see the ordering comment above. + */ + smp_mb__after_atomic(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. + */ +static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} + +static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + /* + * Full barrier (B), see the ordering comment above. 
+ */ + smp_mb(); + return atomic_read(&hb->waiters); +#else + return 1; +#endif +} + +extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); +extern void futex_q_unlock(struct futex_hash_bucket *hb); + + +extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, + struct task_struct **exiting, + int set_waiters); + +extern int refill_pi_state_cache(void); +extern void get_pi_state(struct futex_pi_state *pi_state); +extern void put_pi_state(struct futex_pi_state *pi_state); +extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); + +/* + * Express the locking dependencies for lockdep: + */ +static inline void +double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + if (hb1 > hb2) + swap(hb1, hb2); + + spin_lock(&hb1->lock); + if (hb1 != hb2) + spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); +} + +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); +} + +/* syscalls */ + +extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 + val, ktime_t *abs_time, u32 bitset, u32 __user + *uaddr2); + +extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi); + +extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset); + +/** + * struct futex_vector - Auxiliary struct for futex_waitv() + * @w: Userspace provided data + * @q: Kernel side data + * + * Struct used to build an array with all data need for futex_waitv() + */ +struct futex_vector { + struct futex_waitv w; + struct futex_q q; +}; + +extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); + +extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); + +extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); + +extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); + +#endif /* _FUTEX_H */ diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c new file mode 100644 index 000000000000..183b28c32c83 --- /dev/null +++ b/kernel/futex/pi.c @@ -0,0 +1,1233 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/slab.h> +#include <linux/sched/task.h> + +#include "futex.h" +#include "../locking/rtmutex_common.h" + +/* + * PI code: + */ +int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + refcount_set(&pi_state->refcount, 1); + pi_state->key = FUTEX_KEY_INIT; + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state *alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void pi_state_update_owner(struct futex_pi_state *pi_state, + struct task_struct *new_owner) +{ + struct task_struct *old_owner = pi_state->owner; + + 
lockdep_assert_held(&pi_state->pi_mutex.wait_lock); + + if (old_owner) { + raw_spin_lock(&old_owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&old_owner->pi_lock); + } + + if (new_owner) { + raw_spin_lock(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + raw_spin_unlock(&new_owner->pi_lock); + } +} + +void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); +} + +/* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + */ +void put_pi_state(struct futex_pi_state *pi_state) +{ + if (!pi_state) + return; + + if (!refcount_dec_and_test(&pi_state->refcount)) + return; + + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + unsigned long flags; + + raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); + pi_state_update_owner(pi_state, NULL); + rt_mutex_proxy_unlock(&pi_state->pi_mutex); + raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); + } + + if (current->pi_state_cache) { + kfree(pi_state); + } else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + refcount_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * We need to check the following states: + * + * Waiter | pi_state | pi->owner | uTID | uODIED | ? + * + * [1] NULL | --- | --- | 0 | 0/1 | Valid + * [2] NULL | --- | --- | >0 | 0/1 | Valid + * + * [3] Found | NULL | -- | Any | 0/1 | Invalid + * + * [4] Found | Found | NULL | 0 | 1 | Valid + * [5] Found | Found | NULL | >0 | 1 | Invalid + * + * [6] Found | Found | task | 0 | 1 | Valid + * + * [7] Found | Found | NULL | Any | 0 | Invalid + * + * [8] Found | Found | task | ==taskTID | 0/1 | Valid + * [9] Found | Found | task | 0 | 0 | Invalid + * [10] Found | Found | task | !=taskTID | 0/1 | Invalid + * + * [1] Indicates that the kernel can acquire the futex atomically. We + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2] Valid, if TID does not belong to a kernel thread. If no matching + * thread is found then it indicates that the owner TID has died. + * + * [3] Invalid. The waiter is queued on a non PI futex + * + * [4] Valid state after exit_robust_list(), which sets the user space + * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5] The user space value got manipulated between exit_robust_list() + * and exit_pi_state_list() + * + * [6] Valid state after exit_pi_state_list() which sets the new owner in + * the pi_state but cannot access the user space value. + * + * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8] Owner and user space value match + * + * [9] There is no transient state which sets the user space TID to 0 + * except exit_robust_list(), but this is indicated by the + * FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + * TID out of sync. Except one error case where the kernel is denied + * write access to the user address, see fixup_pi_state_owner(). 
+ * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * pi_mutex->owner -> pi_state->owner, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * + */ + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. + */ +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; + + /* + * Userspace might have messed up non-PI and PI futexes [3] + */ + if (unlikely(!pi_state)) + return -EINVAL; + + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ + WARN_ON(!refcount_read(&pi_state->refcount)); + + /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (futex_get_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + + /* + * Handle the owner died case: + */ + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and wakes the + * topmost waiter. The task which acquires the + * pi_state->rt_mutex will fixup owner. + */ + if (!pi_state->owner) { + /* + * No pi state owner, but the user space TID + * is not 0. Inconsistent state. [5] + */ + if (pid) + goto out_einval; + /* + * Take a ref on the state and return success. [4] + */ + goto out_attach; + } + + /* + * If TID is 0, then either the dying owner has not + * yet executed exit_pi_state_list() or some waiter + * acquired the rtmutex in the pi state, but did not + * yet fixup the TID in user space. + * + * Take a ref on the state and return success. [6] + */ + if (!pid) + goto out_attach; + } else { + /* + * If the owner died bit is not set, then the pi_state + * must have an owner. [7] + */ + if (!pi_state->owner) + goto out_einval; + } + + /* + * Bail out if user space manipulated the futex value. If pi + * state exists then the owner TID must be the same as the + * user space TID. 
[9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + *ps = pi_state; + return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; +} + +static int handle_exit_race(u32 __user *uaddr, u32 uval, + struct task_struct *tsk) +{ + u32 uval2; + + /* + * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the + * caller that the alleged owner is busy. + */ + if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) + return -EBUSY; + + /* + * Reread the user space value to handle the following situation: + * + * CPU0 CPU1 + * + * sys_exit() sys_futex() + * do_exit() futex_lock_pi() + * futex_lock_pi_atomic() + * exit_signals(tsk) No waiters: + * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID + * mm_release(tsk) Set waiter bit + * exit_robust_list(tsk) { *uaddr = 0x80000PID; + * Set owner died attach_to_pi_owner() { + * *uaddr = 0xC0000000; tsk = get_task(PID); + * } if (!tsk->flags & PF_EXITING) { + * ... attach(); + * tsk->futex_state = } else { + * FUTEX_STATE_DEAD; if (tsk->futex_state != + * FUTEX_STATE_DEAD) + * return -EAGAIN; + * return -ESRCH; <--- FAIL + * } + * + * Returning ESRCH unconditionally is wrong here because the + * user space value has been changed by the exiting task. + * + * The same logic applies to the case where the exiting task is + * already gone. + */ + if (futex_get_value_locked(&uval2, uaddr)) + return -EFAULT; + + /* If the user space value has changed, try again. */ + if (uval2 != uval) + return -EAGAIN; + + /* + * The exiting task did not have a robust list, the robust list was + * corrupted or the user space value in *uaddr is simply bogus. + * Give up and tell user space. + */ + return -ESRCH; +} + +static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, + struct futex_pi_state **ps) +{ + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. + */ + struct futex_pi_state *pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ + pi_state->owner = p; + + *ps = pi_state; +} +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. + */ +static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, + struct futex_pi_state **ps, + struct task_struct **exiting) +{ + pid_t pid = uval & FUTEX_TID_MASK; + struct task_struct *p; + + /* + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when TID = 0 [1] + * + * The !pid check is paranoid. None of the call sites should end up + * with pid == 0, but better safe than sorry. 
Let the caller retry + */ + if (!pid) + return -EAGAIN; + p = find_get_task_by_vpid(pid); + if (!p) + return handle_exit_race(uaddr, uval, NULL); + + if (unlikely(p->flags & PF_KTHREAD)) { + put_task_struct(p); + return -EPERM; + } + + /* + * We need to look at the task state to figure out, whether the + * task is exiting. To protect against the change of the task state + * in futex_exit_release(), we do this protected by p->pi_lock: + */ + raw_spin_lock_irq(&p->pi_lock); + if (unlikely(p->futex_state != FUTEX_STATE_OK)) { + /* + * The task is on the way out. When the futex state is + * FUTEX_STATE_DEAD, we know that the task has finished + * the cleanup: + */ + int ret = handle_exit_race(uaddr, uval, p); + + raw_spin_unlock_irq(&p->pi_lock); + /* + * If the owner task is between FUTEX_STATE_EXITING and + * FUTEX_STATE_DEAD then store the task pointer and keep + * the reference on the task struct. The calling code will + * drop all locks, wait for the task to reach + * FUTEX_STATE_DEAD and then drop the refcount. This is + * required to prevent a live lock when the current task + * preempted the exiting task between the two states. + */ + if (ret == -EBUSY) + *exiting = p; + else + put_task_struct(p); + return ret; + } + + __attach_to_pi_owner(p, key, ps); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + return 0; +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ + int err; + u32 curval; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; + + /* If user space value changed, let the caller retry */ + return curval != uval ? -EAGAIN : 0; +} + +/** + * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Return: + * - 0 - ready to wait; + * - 1 - acquired the lock; + * - <0 - error + * + * The hb->lock must be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + */ +int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, + struct task_struct **exiting, + int set_waiters) +{ + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *top_waiter; + int ret; + + /* + * Read the user space value first so we can validate a few + * things before proceeding further. + */ + if (futex_get_value_locked(&uval, uaddr)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) + return -EDEADLK; + + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + + /* + * Lookup existing state first. If it exists, try to attach to + * its pi_state. 
+ */ + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); + + /* + * No waiter and user TID is 0. We are here because the + * waiters or the owner died bit is set or called from + * requeue_cmp_pi or for whatever reason something took the + * syscall. + */ + if (!(uval & FUTEX_TID_MASK)) { + /* + * We take over the futex. No other waiters and the user space + * TID is 0. We preserve the owner died bit. + */ + newval = uval & FUTEX_OWNER_DIED; + newval |= vpid; + + /* The futex requeue_pi code can enforce the waiters bit */ + if (set_waiters) + newval |= FUTEX_WAITERS; + + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; + + /* + * If the waiter bit was requested the caller also needs PI + * state attached to the new owner of the user space futex. + * + * @task is guaranteed to be alive and it cannot be exiting + * because it is either sleeping or waiting in + * futex_requeue_pi_wakeup_sync(). + * + * No need to do the full attach_to_pi_owner() exercise + * because @task is known and valid. + */ + if (set_waiters) { + raw_spin_lock_irq(&task->pi_lock); + __attach_to_pi_owner(task, key, ps); + raw_spin_unlock_irq(&task->pi_lock); + } + return 1; + } + + /* + * First waiter. Set the waiters bit before attaching ourself to + * the owner. If owner tries to unlock, it will be forced into + * the kernel and blocked on hb->lock. + */ + newval = uval | FUTEX_WAITERS; + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; + /* + * If the update of the user space value succeeded, we try to + * attach to the owner. If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. + */ + return attach_to_pi_owner(uaddr, newval, key, ps, exiting); +} + +/* + * Caller must hold a reference on @pi_state. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) +{ + struct rt_mutex_waiter *top_waiter; + struct task_struct *new_owner; + bool postunlock = false; + DEFINE_RT_WAKE_Q(wqh); + u32 curval, newval; + int ret = 0; + + top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!top_waiter)) { + /* + * As per the comment in futex_unlock_pi() this should not happen. + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; + } + + new_owner = top_waiter->task; + + /* + * We pass it to the next owner. The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. + */ + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + + if (unlikely(should_fail_futex(true))) { + ret = -EFAULT; + goto out_unlock; + } + + ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); + if (!ret && (curval != uval)) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. + */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } + + if (!ret) { + /* + * This is a point of no return; once we modified the uval + * there is no going back and subsequent operations must + * not fail. 
+ */ + pi_state_update_owner(pi_state, new_owner); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); + } + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wqh); + + return ret; +} + +static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner, *newowner; + u32 uval, curval, newval, newtid; + int err = 0; + + oldowner = pi_state->owner; + + /* + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), + * + * or: + * + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the pi_state + * because we can fault here. Imagine swapped out pages or a fork + * that marked all the anonymous memory readonly for cow. + * + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID checks when attaching to PI state . + */ +retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + return 0; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock. pi_state is correct. Tell caller. */ + return 1; + } + + /* + * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_err; + } + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + return 1; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + err = futex_get_value_locked(&uval, uaddr); + if (err) + goto handle_err; + + for (;;) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ + pi_state_update_owner(pi_state, newowner); + + return argowner == current; + + /* + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. 
When + * the fixup has been done already we simply return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. + */ +handle_err: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(q->lock_ptr); + + switch (err) { + case -EFAULT: + err = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + err = 0; + break; + + default: + WARN_ON_ONCE(1); + break; + } + + spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return argowner == current; + + /* Retry if err was -EAGAIN or the fault in succeeded */ + if (!err) + goto retry; + + /* + * fault_in_user_writeable() failed so user state is immutable. At + * best we can make the kernel state consistent but user state will + * be most likely hosed and any subsequent unlock operation will be + * rejected due to PI futex rule [10]. + * + * Ensure that the rtmutex owner is also the pi_state owner despite + * the user space value claiming something different. There is no + * point in unlocking the rtmutex if current is the owner as it + * would need to wait until the next waiter has taken the rtmutex + * to guarantee consistent state. Keep it simple. Userspace asked + * for this wreckaged state. + * + * The rtmutex has an owner - either current or some other + * task. See the EAGAIN loop above. + */ + pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); + + return err; +} + +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + int ret; + + lockdep_assert_held(q->lock_ptr); + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + ret = __fixup_pi_state_owner(uaddr, q, argowner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; +} + +/** + * fixup_pi_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Return: + * - 1 - success, lock taken; + * - 0 - success, lock not taken; + * - <0 - on error (-EFAULT) + */ +int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) +{ + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + * + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. + */ + if (q->pi_state->owner != current) + return fixup_pi_state_owner(uaddr, q, current); + return 1; + } + + /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) + return fixup_pi_state_owner(uaddr, q, NULL); + + /* + * Paranoia check. 
If we did not take the lock, then we should not be + * the owner of the rt_mutex. Warn and establish consistent state. + */ + if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) + return fixup_pi_state_owner(uaddr, q, current); + + return 0; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. + */ +int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) +{ + struct hrtimer_sleeper timeout, *to; + struct task_struct *exiting = NULL; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int res, ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + + if (refill_pi_state_cache()) + return -ENOMEM; + + to = futex_setup_timer(time, &timeout, flags, 0); + +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); + if (unlikely(ret != 0)) + goto out; + +retry_private: + hb = futex_q_lock(&q); + + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); + if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ + switch (ret) { + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Task is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + futex_q_unlock(hb); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock_put_key; + } + } + + WARN_ON(!q.pi_state); + + /* + * Only actually queue now that the atomic ops are done: + */ + __futex_queue(&q, hb); + + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } + + rt_mutex_init_waiter(&rt_waiter); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. 
+ */ + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; + goto cleanup; + } + + if (unlikely(to)) + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + +cleanup: + spin_lock(q.lock_ptr); + /* + * If we failed to acquire the lock (deadlock/signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait + * lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_pi_owner(uaddr, &q, !ret); + /* + * If fixup_pi_owner() returned an error, propagate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + futex_unqueue_pi(&q); + spin_unlock(q.lock_ptr); + goto out; + +out_unlock_put_key: + futex_q_unlock(hb); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret != -EINTR ? ret : -ERESTARTNOINTR; + +uaddr_faulted: + futex_q_unlock(hb); + + ret = fault_in_user_writeable(uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +{ + u32 curval, uval, vpid = task_pid_vnr(current); + union futex_key key = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb; + struct futex_q *top_waiter; + int ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != vpid) + return -EPERM; + + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); + if (ret) + return ret; + + hb = futex_hash(&key); + spin_lock(&hb->lock); + + /* + * Check waiters first. We do not trust user space values at + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. + */ + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_p() must observe a state consistent with what we + * observed. + * + * In particular; this forces __rt_mutex_start_proxy() to + * complete such that we're guaranteed to observe the + * rt_waiter. Also see the WARN in wake_futex_pi(). + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + /* drops pi_state->pi_mutex.wait_lock */ + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! 
No tricky corner cases. + */ + if (!ret) + return ret; + /* + * The atomic access to the futex value generated a + * pagefault, so retry the user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. + */ + if (ret == -EAGAIN) + goto pi_retry; + /* + * wake_futex_pi has detected invalid state. Tell user + * space. + */ + return ret; + } + + /* + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck + * on hb->lock. So we can safely ignore them. We do neither + * preserve the WAITERS bit not the OWNER_DIED one. We are the + * owner. + */ + if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) { + spin_unlock(&hb->lock); + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + return ret; + } + } + + /* + * If uval has changed, let user space handle it. + */ + ret = (curval == uval) ? 0 : -EAGAIN; + +out_unlock: + spin_unlock(&hb->lock); + return ret; + +pi_retry: + cond_resched(); + goto retry; + +pi_faulted: + + ret = fault_in_user_writeable(uaddr); + if (!ret) + goto retry; + + return ret; +} + diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c new file mode 100644 index 000000000000..cba8b1a6a4cc --- /dev/null +++ b/kernel/futex/requeue.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/sched/signal.h> + +#include "futex.h" +#include "../locking/rtmutex_common.h" + +/* + * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an + * underlying rtmutex. The task which is about to be requeued could have + * just woken up (timeout, signal). After the wake up the task has to + * acquire hash bucket lock, which is held by the requeue code. As a task + * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking + * and the hash bucket lock blocking would collide and corrupt state. + * + * On !PREEMPT_RT this is not a problem and everything could be serialized + * on hash bucket lock, but aside of having the benefit of common code, + * this allows to avoid doing the requeue when the task is already on the + * way out and taking the hash bucket lock of the original uaddr1 when the + * requeue has been completed. + * + * The following state transitions are valid: + * + * On the waiter side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT + * + * On the requeue side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) + * + * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this + * signals that the waiter is already on the way out. It also means that + * the waiter is still on the 'wait' futex, i.e. uaddr1. + * + * The waiter side signals early wakeup to the requeue side either through + * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending + * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately + * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, + * which means the wakeup is interleaving with a requeue in progress it has + * to wait for the requeue side to change the state. 
Either to DONE/LOCKED + * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex + * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by + * the requeue side when the requeue attempt failed via deadlock detection + * and therefore the waiter q is still on the uaddr1 futex. + */ +enum { + Q_REQUEUE_PI_NONE = 0, + Q_REQUEUE_PI_IGNORE, + Q_REQUEUE_PI_IN_PROGRESS, + Q_REQUEUE_PI_WAIT, + Q_REQUEUE_PI_DONE, + Q_REQUEUE_PI_LOCKED, +}; + +const struct futex_q futex_q_init = { + /* list gets initialized in futex_queue()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY, + .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), +}; + +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + futex_hb_waiters_dec(hb1); + futex_hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; + } + q->key = *key2; +} + +static inline bool futex_requeue_pi_prepare(struct futex_q *q, + struct futex_pi_state *pi_state) +{ + int old, new; + + /* + * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has + * already set Q_REQUEUE_PI_IGNORE to signal that requeue should + * ignore the waiter. + */ + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return false; + + /* + * futex_proxy_trylock_atomic() might have set it to + * IN_PROGRESS and a interleaved early wake to WAIT. + * + * It was considered to have an extra state for that + * trylock, but that would just add more conditionals + * all over the place for a dubious value. + */ + if (old != Q_REQUEUE_PI_NONE) + break; + + new = Q_REQUEUE_PI_IN_PROGRESS; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + q->pi_state = pi_state; + return true; +} + +static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return; + + if (locked >= 0) { + /* Requeue succeeded. Set DONE or LOCKED */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && + old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_DONE + locked; + } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { + /* Deadlock, no early wakeup interleave */ + new = Q_REQUEUE_PI_NONE; + } else { + /* Deadlock, early wakeup interleave. */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_IGNORE; + } + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + +#ifdef CONFIG_PREEMPT_RT + /* If the waiter interleaved with the requeue let it know */ + if (unlikely(old == Q_REQUEUE_PI_WAIT)) + rcuwait_wake_up(&q->requeue_wait); +#endif +} + +static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + /* Is requeue done already? */ + if (old >= Q_REQUEUE_PI_DONE) + return old; + + /* + * If not done, then tell the requeue code to either ignore + * the waiter or to wake it up once the requeue is done. 
+ */ + new = Q_REQUEUE_PI_WAIT; + if (old == Q_REQUEUE_PI_NONE) + new = Q_REQUEUE_PI_IGNORE; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + /* If the requeue was in progress, wait for it to complete */ + if (old == Q_REQUEUE_PI_IN_PROGRESS) { +#ifdef CONFIG_PREEMPT_RT + rcuwait_wait_event(&q->requeue_wait, + atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, + TASK_UNINTERRUPTIBLE); +#else + (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); +#endif + } + + /* + * Requeue is now either prohibited or complete. Reread state + * because during the wait above it might have changed. Nothing + * will modify q->requeue_state after this point. + */ + return atomic_read(&q->requeue_state); +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. + * + * 1) Set @q::key to the requeue target futex key so the waiter can detect + * the wakeup on the right futex. + * + * 2) Dequeue @q from the hash bucket. + * + * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock + * acquisition. + * + * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that + * the waiter has to fixup the pi state. + * + * 5) Complete the requeue state so the waiter can make progress. After + * this point the waiter task can return from the syscall immediately in + * case that the pi state does not have to be fixed up. + * + * 6) Wake the waiter task. + * + * Must be called with both q->lock_ptr and hb->lock held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) +{ + q->key = *key; + + __futex_unqueue(q); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + q->lock_ptr = &hb->lock; + + /* Signal locked state to the waiter */ + futex_requeue_pi_complete(q, 1); + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. 
+ * + * Return: + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error + */ +static int +futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key1, + union futex_key *key2, struct futex_pi_state **ps, + struct task_struct **exiting, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret; + + if (futex_get_value_locked(&curval, pifutex)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unnecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Ensure that this is a waiter sitting in futex_wait_requeue_pi() + * and waiting on the 'waitqueue' futex which is always !PI. + */ + if (!top_waiter->rt_waiter || top_waiter->pi_state) + return -EINVAL; + + /* Ensure we requeue to the expected futex. */ + if (!futex_match(top_waiter->requeue_pi_key, key2)) + return -EINVAL; + + /* Ensure that this does not race against an early wakeup */ + if (!futex_requeue_pi_prepare(top_waiter, NULL)) + return -EAGAIN; + + /* + * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit + * in the contended case or if @set_waiters is true. + * + * In the contended case PI state is attached to the lock owner. If + * the user space lock can be acquired then PI state is attached to + * the new owner (@top_waiter->task) when @set_waiters is true. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + exiting, set_waiters); + if (ret == 1) { + /* + * Lock was acquired in user space and PI state was + * attached to @top_waiter->task. That means state is fully + * consistent and the waiter can return to user space + * immediately after the wakeup. + */ + requeue_pi_wake_futex(top_waiter, key2, hb2); + } else if (ret < 0) { + /* Rewind top_waiter::requeue_state */ + futex_requeue_pi_complete(top_waiter, ret); + } else { + /* + * futex_lock_pi_atomic() did not acquire the user space + * futex, but managed to establish the proxy lock and pi + * state. top_waiter::requeue_state cannot be fixed up here + * because the waiter is not enqueued on the rtmutex + * yet. This is handled at the callsite depending on the + * result of rt_mutex_start_proxy_lock() which is + * guaranteed to be reached with this function returning 0. + */ + } + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * @uaddr1: source futex user address + * @flags: futex flags (FLAGS_SHARED, etc.) + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. 
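For the non-PI case, the typical caller of this path is a condition-variable style broadcast: wake one waiter on the condition futex and requeue the remaining ones onto the mutex futex so they are not all woken at once. A rough user-space sketch (the helper name is hypothetical and error handling is omitted; note that the raw syscall passes nr_requeue through the timeout argument slot):

#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wake one waiter on *cond and requeue the others onto *mutex_word,
 * but only if *cond still contains 'expected' (otherwise -EAGAIN). */
static long cond_broadcast(uint32_t *cond, uint32_t expected, uint32_t *mutex_word)
{
	return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PRIVATE,
		       1 /* nr_wake */,
		       (void *)(unsigned long)INT_MAX /* nr_requeue, in the timeout slot */,
		       mutex_word, expected /* val3 = cmpval */);
}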
+ * + * Return: + * - >=0 - on success, the number of tasks requeued or woken; + * - <0 - on error + */ +int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + DEFINE_WAKE_Q(wake_q); + + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + + if (requeue_pi) { + /* + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. + */ + if (uaddr1 == uaddr2) + return -EINVAL; + + /* + * futex_requeue() allows the caller to define the number + * of waiters to wake up via the @nr_wake argument. With + * REQUEUE_PI, waking up more than one waiter is creating + * more problems than it solves. Waking up a waiter makes + * only sense if the PI futex @uaddr2 is uncontended as + * this allows the requeue code to acquire the futex + * @uaddr2 before waking the waiter. The waiter can then + * return to user space without further action. A secondary + * wakeup would just make the futex_wait_requeue_pi() + * handling more complex, because that code would have to + * look up pi_state and do more or less all the handling + * which the requeue code has to do for the to be requeued + * waiters. So restrict the number of waiters to wake to + * one, and only wake it up when the PI futex is + * uncontended. Otherwise requeue it and let the unlock of + * the PI futex handle the wakeup. + * + * All REQUEUE_PI users, e.g. pthread_cond_signal() and + * pthread_cond_broadcast() must use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + } + +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? FUTEX_WRITE : FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (requeue_pi && futex_match(&key1, &key2)) + return -EINVAL; + + hb1 = futex_hash(&key1); + hb2 = futex_hash(&key2); + +retry_private: + futex_hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { + u32 curval; + + ret = futex_get_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { + double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + + ret = get_user(curval, uaddr1); + if (ret) + return ret; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + if (requeue_pi) { + struct task_struct *exiting = NULL; + + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. 
+ * + * Updates topwaiter::requeue_state if a top waiter exists. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, + &exiting, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret == 1), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. + */ + switch (ret) { + case 0: + /* We hold a reference on the pi state. */ + break; + + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ + case -EFAULT: + double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + return ret; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; + + if (!futex_match(&this->key, &key1)) + continue; + + /* + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } + + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + futex_wake_mark(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); + continue; + } + + /* Ensure we requeue to the expected futex for requeue_pi. 
*/ + if (!futex_match(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } + + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. + */ + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { + /* + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. + */ + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; + } + } + + /* + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. + */ + put_pi_state(pi_state); + +out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); + futex_hb_waiters_dec(hb2); + return ret ? ret : task_count; +} + +/** + * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @timeout: the timeout associated with the wait (NULL if none) + * + * Determine the cause for the early wakeup. + * + * Return: + * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + int ret; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + WARN_ON_ONCE(&hb->lock != q->lock_ptr); + + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. 
+ */ + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. + * + * We call schedule in futex_wait_queue() when we enqueue and return there + * via the following-- + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout + * + * If 3, cleanup and return -ERESTARTNOINTR. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Return: + * - 0 - On success; + * - <0 - On error + */ +int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + u32 val, ktime_t *abs_time, u32 bitset, + u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + struct rt_mutex_base *pi_mutex; + int res, ret; + + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + + if (uaddr == uaddr2) + return -EINVAL; + + if (!bitset) + return -EINVAL; + + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + rt_mutex_init_waiter(&rt_waiter); + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) + goto out; + + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + + /* + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (futex_match(&q.key, &key2)) { + futex_q_unlock(hb); + ret = -EINVAL; + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. 
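In user space, this function pairs with FUTEX_CMP_REQUEUE_PI as used by PI-aware condition variables: the waiter sleeps on the condition futex and is requeued onto the PI mutex, while the signaler wakes at most one waiter and requeues the rest. A hedged sketch of that pairing (helper names are hypothetical; timeouts and error handling omitted):

#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Waiter: sleep on *cond while it still holds 'seen'; on wakeup the kernel
 * has requeued us onto *pi_mutex and may already have acquired it for us. */
static long cond_wait_pi(uint32_t *cond, uint32_t seen, uint32_t *pi_mutex)
{
	return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI_PRIVATE, seen,
		       NULL, pi_mutex, 0);
}

/* Signaler: wake at most one waiter, requeue all others onto the PI mutex. */
static long cond_broadcast_pi(uint32_t *cond, uint32_t curval, uint32_t *pi_mutex)
{
	return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI_PRIVATE,
		       1 /* nr_wake */,
		       (void *)(unsigned long)INT_MAX /* nr_requeue */,
		       pi_mutex, curval);
}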
*/ + futex_wait_queue(hb, &q, to); + + switch (futex_requeue_pi_wakeup_sync(&q)) { + case Q_REQUEUE_PI_IGNORE: + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + break; + + case Q_REQUEUE_PI_LOCKED: + /* The requeue acquired the lock */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_owner(uaddr2, &q, true); + /* + * Drop the reference to the pi state which the + * requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); + spin_unlock(q.lock_ptr); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. + */ + ret = ret < 0 ? ret : 0; + } + break; + + case Q_REQUEUE_PI_DONE: + /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + + /* Current is not longer pi_blocked_on */ + spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_pi_owner(uaddr2, &q, !ret); + /* + * If fixup_pi_owner() returned an error, propagate that. If it + * acquired the lock, clear -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + futex_unqueue_pi(&q); + spin_unlock(q.lock_ptr); + + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart + * by calling futex_lock_pi() directly. We could + * restart this syscall, but it would detect that + * the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart + * and return -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + break; + default: + BUG(); + } + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c new file mode 100644 index 000000000000..6f91a07a6a83 --- /dev/null +++ b/kernel/futex/syscalls.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <linux/time_namespace.h> + +#include "futex.h" + +/* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. + * + * Implementation: user-space maintains a per-thread list of locks it + * is holding. Upon do_exit(), the kernel carefully walks this list, + * and marks all locks that are owned by this thread with the + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is + * always manipulated with the lock held, so the list is private and + * per-thread. Userspace also maintains a per-thread 'list_op_pending' + * field, to allow the kernel to clean up if the thread dies after + * acquiring the lock, but just before it could have added itself to + * the list. There can only be one such pending lock. 
+ */ + +/** + * sys_set_robust_list() - Set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects + */ +SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, + size_t, len) +{ + if (!futex_cmpxchg_enabled) + return -ENOSYS; + /* + * The kernel knows only one size for now: + */ + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->robust_list = head; + + return 0; +} + +/** + * sys_get_robust_list() - Get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size + */ +SYSCALL_DEFINE3(get_robust_list, int, pid, + struct robust_list_head __user * __user *, head_ptr, + size_t __user *, len_ptr) +{ + struct robust_list_head __user *head; + unsigned long ret; + struct task_struct *p; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + rcu_read_lock(); + + ret = -ESRCH; + if (!pid) + p = current; + else { + p = find_task_by_vpid(pid); + if (!p) + goto err_unlock; + } + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = p->robust_list; + rcu_read_unlock(); + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(head, head_ptr); + +err_unlock: + rcu_read_unlock(); + + return ret; +} + +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3) +{ + int cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && + cmd != FUTEX_LOCK_PI2) + return -ENOSYS; + } + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + if (!futex_cmpxchg_enabled) + return -ENOSYS; + } + + switch (cmd) { + case FUTEX_WAIT: + val3 = FUTEX_BITSET_MATCH_ANY; + fallthrough; + case FUTEX_WAIT_BITSET: + return futex_wait(uaddr, flags, val, timeout, val3); + case FUTEX_WAKE: + val3 = FUTEX_BITSET_MATCH_ANY; + fallthrough; + case FUTEX_WAKE_BITSET: + return futex_wake(uaddr, flags, val, val3); + case FUTEX_REQUEUE: + return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); + case FUTEX_CMP_REQUEUE: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); + case FUTEX_WAKE_OP: + return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); + case FUTEX_LOCK_PI: + flags |= FLAGS_CLOCKRT; + fallthrough; + case FUTEX_LOCK_PI2: + return futex_lock_pi(uaddr, flags, timeout, 0); + case FUTEX_UNLOCK_PI: + return futex_unlock_pi(uaddr, flags); + case FUTEX_TRYLOCK_PI: + return futex_lock_pi(uaddr, flags, NULL, 1); + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + } + return -ENOSYS; +} + +static __always_inline bool futex_cmd_has_timeout(u32 cmd) +{ + switch (cmd) { + case FUTEX_WAIT: + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + return true; + } + return false; +} + +static __always_inline int +futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) +{ + if 
(!timespec64_valid(ts)) + return -EINVAL; + + *t = timespec64_to_ktime(*ts); + if (cmd == FUTEX_WAIT) + *t = ktime_add_safe(ktime_get(), *t); + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) + *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); + return 0; +} + +SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + const struct __kernel_timespec __user *, utime, + u32 __user *, uaddr2, u32, val3) +{ + int ret, cmd = op & FUTEX_CMD_MASK; + ktime_t t, *tp = NULL; + struct timespec64 ts; + + if (utime && futex_cmd_has_timeout(cmd)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) + return -EFAULT; + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; + tp = &t; + } + + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); +} + +/* Mask of available flags for each futex in futex_waitv list */ +#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) + +/** + * futex_parse_waitv - Parse a waitv array from userspace + * @futexv: Kernel side list of waiters to be filled + * @uwaitv: Userspace list to be parsed + * @nr_futexes: Length of futexv + * + * Return: Error code on failure, 0 on success + */ +static int futex_parse_waitv(struct futex_vector *futexv, + struct futex_waitv __user *uwaitv, + unsigned int nr_futexes) +{ + struct futex_waitv aux; + unsigned int i; + + for (i = 0; i < nr_futexes; i++) { + if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) + return -EFAULT; + + if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) + return -EINVAL; + + if (!(aux.flags & FUTEX_32)) + return -EINVAL; + + futexv[i].w.flags = aux.flags; + futexv[i].w.val = aux.val; + futexv[i].w.uaddr = aux.uaddr; + futexv[i].q = futex_q_init; + } + + return 0; +} + +/** + * sys_futex_waitv - Wait on a list of futexes + * @waiters: List of futexes to wait on + * @nr_futexes: Length of futexv + * @flags: Flag for timeout (monotonic/realtime) + * @timeout: Optional absolute timeout. + * @clockid: Clock to be used for the timeout, realtime or monotonic. + * + * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes + * if a futex_wake() is performed at any uaddr. The syscall returns immediately + * if any waiter has *uaddr != val. *timeout is an optional timeout value for + * the operation. Each waiter has individual flags. The `flags` argument for + * the syscall should be used solely for specifying the timeout as realtime, if + * needed. Flags for private futexes, sizes, etc. should be used on the + * individual flags of each waiter. + * + * Returns the array index of one of the woken futexes. No further information + * is provided: any number of other futexes may also have been woken by the + * same event, and if more than one futex was woken, the returned index may + * refer to any one of them. (It is not necessarily the futex with the + * smallest index, nor the one most recently woken, nor...)
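A minimal user-space sketch of the call (illustrative only; it assumes headers that provide struct futex_waitv, FUTEX_32 and SYS_futex_waitv, i.e. v5.16+ uapi, and omits error handling and the timeout):

#include <linux/futex.h>     /* struct futex_waitv, FUTEX_32 */
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Wait until either of two 32-bit futexes is woken or its value differs. */
static long wait_on_two(uint32_t *a, uint32_t a_val, uint32_t *b, uint32_t b_val)
{
	struct futex_waitv waiters[2] = {
		{ .val = a_val, .uaddr = (uintptr_t)a, .flags = FUTEX_32 },
		{ .val = b_val, .uaddr = (uintptr_t)b, .flags = FUTEX_32 },
	};

	/* Returns the index of a woken entry, or a negative error. */
	return syscall(SYS_futex_waitv, waiters, 2, 0, NULL, CLOCK_MONOTONIC);
}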
+ */ + +SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, + unsigned int, nr_futexes, unsigned int, flags, + struct __kernel_timespec __user *, timeout, clockid_t, clockid) +{ + struct hrtimer_sleeper to; + struct futex_vector *futexv; + struct timespec64 ts; + ktime_t time; + int ret; + + /* This syscall supports no flags for now */ + if (flags) + return -EINVAL; + + if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) + return -EINVAL; + + if (timeout) { + int flag_clkid = 0, flag_init = 0; + + if (clockid == CLOCK_REALTIME) { + flag_clkid = FLAGS_CLOCKRT; + flag_init = FUTEX_CLOCK_REALTIME; + } + + if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) + return -EINVAL; + + if (get_timespec64(&ts, timeout)) + return -EFAULT; + + /* + * Since there's no opcode for futex_waitv, use + * FUTEX_WAIT_BITSET that uses absolute timeout as well + */ + ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); + if (ret) + return ret; + + futex_setup_timer(&time, &to, flag_clkid, 0); + } + + futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return -ENOMEM; + + ret = futex_parse_waitv(futexv, waiters, nr_futexes); + if (!ret) + ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); + + if (timeout) { + hrtimer_cancel(&to.timer); + destroy_hrtimer_on_stack(&to.timer); + } + + kfree(futexv); + return ret; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) +{ + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->compat_robust_list = head; + + return 0; +} + +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) +{ + struct compat_robust_list_head __user *head; + unsigned long ret; + struct task_struct *p; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + rcu_read_lock(); + + ret = -ESRCH; + if (!pid) + p = current; + else { + p = find_task_by_vpid(pid); + if (!p) + goto err_unlock; + } + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = p->compat_robust_list; + rcu_read_unlock(); + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(ptr_to_compat(head), head_ptr); + +err_unlock: + rcu_read_unlock(); + + return ret; +} +#endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +{ + int ret, cmd = op & FUTEX_CMD_MASK; + ktime_t t, *tp = NULL; + struct timespec64 ts; + + if (utime && futex_cmd_has_timeout(cmd)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; + tp = &t; + } + + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); +} +#endif /* CONFIG_COMPAT_32BIT_TIME */ + diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c new file mode 100644 index 000000000000..4ce0923f1ce3 --- /dev/null +++ b/kernel/futex/waitwake.c @@ -0,0 +1,708 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/sched/task.h> +#include <linux/sched/signal.h> +#include <linux/freezer.h> + +#include "futex.h" + +/* + * READ this before attempting to hack on futexes! 
+ * + * Basic futex operation and ordering guarantees + * ============================================= + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. + * + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * if (queue_empty()) + * return; + * if (uval == val) + * lock(hash_bucket(futex)); + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * + * waiters++; (a) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) + * if (uval == val) + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); if (waiters) + * lock(hash_bucket(futex)); + * else wake_waiters(futex); + * waiters--; (b) unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read through + * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write + * to futex and the waiters read (see futex_hb_waiters_pending()). + * + * This yields the following case (where X:=waiters, Y:=futex): + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. + * + * Note that a new waiter is accounted for in (a) even when it is possible that + * the wait call can return error, in which case we backtrack from it in (b). + * Refer to the comment in futex_q_lock(). + * + * Similarly, in order to account for waiters being requeued on another + * address we always increment the waiters for the destination bucket before + * acquiring the lock. It then decrements them again after releasing it - + * the code that actually moves the futex(es) between hash buckets (requeue_futex) + * will do the additional required waiter count housekeeping. This is done for + * double_lock_hb() and double_unlock_hb(), respectively. 
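The user-space half of this contract is the usual load-check-sleep loop on the waiter side and store-then-wake on the waker side; the barriers described above are what make that combination safe against the lost-wakeup race. A hedged sketch using raw syscalls (single shared word, no error handling):

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic unsigned int futex_word;   /* example state word */

/* Waiter: block while the word still holds 'val'; spurious wakeups are fine. */
static void wait_for_change(unsigned int val)
{
	while (atomic_load(&futex_word) == val)
		syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0);
}

/* Waker: publish the new value first, then wake one sleeper. */
static void publish_and_wake(unsigned int newval)
{
	atomic_store(&futex_word, newval);
	syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
}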
+ */ + +/* + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. Callers + * must ensure to later call wake_up_q() for the actual + * wakeups to occur. + */ +void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) +{ + struct task_struct *p = q->task; + + if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) + return; + + get_task_struct(p); + __futex_unqueue(q); + /* + * The waiting task can free the futex_q as soon as q->lock_ptr = NULL + * is written, without taking any locks. This is possible in the event + * of a spurious wakeup, for example. A memory barrier is required here + * to prevent the following store to lock_ptr from getting ahead of the + * plist_del in __futex_unqueue(). + */ + smp_store_release(&q->lock_ptr, NULL); + + /* + * Queue the task for later wakeup for after we've released + * the hb->lock. + */ + wake_q_add_safe(wake_q, p); +} + +/* + * Wake up waiters matching bitset queued on this futex (uaddr). + */ +int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + union futex_key key = FUTEX_KEY_INIT; + int ret; + DEFINE_WAKE_Q(wake_q); + + if (!bitset) + return -EINVAL; + + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + + hb = futex_hash(&key); + + /* Make sure we really have tasks to wakeup */ + if (!futex_hb_waiters_pending(hb)) + return ret; + + spin_lock(&hb->lock); + + plist_for_each_entry_safe(this, next, &hb->chain, list) { + if (futex_match (&this->key, &key)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + break; + } + + /* Check if one of the bits is set in both bitsets */ + if (!(this->bitset & bitset)) + continue; + + futex_wake_mark(&wake_q, this); + if (++ret >= nr_wake) + break; + } + } + + spin_unlock(&hb->lock); + wake_up_q(&wake_q); + return ret; +} + +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ + unsigned int op = (encoded_op & 0x70000000) >> 28; + unsigned int cmp = (encoded_op & 0x0f000000) >> 24; + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); + int oldval, ret; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { + if (oparg < 0 || oparg > 31) { + char comm[sizeof(current->comm)]; + /* + * kill this print and return -EINVAL when userspace + * is sane again + */ + pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", + get_task_comm(comm, current), oparg); + oparg &= 31; + } + oparg = 1 << oparg; + } + + pagefault_disable(); + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + pagefault_enable(); + if (ret) + return ret; + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + return oldval == cmparg; + case FUTEX_OP_CMP_NE: + return oldval != cmparg; + case FUTEX_OP_CMP_LT: + return oldval < cmparg; + case FUTEX_OP_CMP_GE: + return oldval >= cmparg; + case FUTEX_OP_CMP_LE: + return oldval <= cmparg; + case FUTEX_OP_CMP_GT: + return oldval > cmparg; + default: + return -ENOSYS; + } +} + +/* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + int nr_wake, int nr_wake2, int op) +{ + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + int ret, 
op_ret; + DEFINE_WAKE_Q(wake_q); + +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) + return ret; + + hb1 = futex_hash(&key1); + hb2 = futex_hash(&key2); + +retry_private: + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { + double_unlock_hb(hb1, hb2); + + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ + ret = op_ret; + return ret; + } + + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + return ret; + } + + cond_resched(); + if (!(flags & FLAGS_SHARED)) + goto retry_private; + goto retry; + } + + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (futex_match (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + futex_wake_mark(&wake_q, this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + op_ret = 0; + plist_for_each_entry_safe(this, next, &hb2->chain, list) { + if (futex_match (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + futex_wake_mark(&wake_q, this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + +out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); + return ret; +} + +static long futex_wait_restart(struct restart_block *restart); + +/** + * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ +void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using smp_store_mb() and + * futex_queue() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + set_current_state(TASK_INTERRUPTIBLE); + futex_queue(q, hb); + + /* Arm the timer */ + if (timeout) + hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); + + /* + * If we have been removed from the hash list, then another task + * has tried to wake us, and we can skip the call to schedule(). + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + freezable_schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/** + * unqueue_multiple - Remove various futexes from their hash bucket + * @v: The list of futexes to unqueue + * @count: Number of futexes in the list + * + * Helper to unqueue a list of futexes. This can't fail. 
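The bitfield decode in futex_atomic_op_inuser() above is the kernel-side counterpart of the uapi FUTEX_OP() encoding helper. A userspace sketch of building and using such an op word, assuming the FUTEX_OP(), FUTEX_OP_SET, FUTEX_OP_CMP_GT and FUTEX_WAKE_OP definitions from <linux/futex.h>; the two-word wakeup scenario itself is only illustrative.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

/*
 * Wake one waiter on uaddr1 unconditionally.  In the same call, atomically
 * set *uaddr2 to 0 and, if its old value was greater than 0, also wake one
 * waiter on uaddr2.  FUTEX_OP() packs the operation into bits 28-31 (plus
 * FUTEX_OP_OPARG_SHIFT when oparg is a shift count), the comparison into
 * bits 24-27, oparg into bits 12-23 and cmparg into bits 0-11, matching the
 * masks used by futex_atomic_op_inuser().
 */
static long wake_op_example(uint32_t *uaddr1, uint32_t *uaddr2)
{
	unsigned int op = FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 0);

	return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1, 1, uaddr2, op);
}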
+ * + * Return: + * - >=0 - Index of the last futex that was awoken; + * - -1 - No futex was awoken + */ +static int unqueue_multiple(struct futex_vector *v, int count) +{ + int ret = -1, i; + + for (i = 0; i < count; i++) { + if (!futex_unqueue(&v[i].q)) + ret = i; + } + + return ret; +} + +/** + * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes + * @vs: The futex list to wait on + * @count: The size of the list + * @woken: Index of the last woken futex, if any. Used to notify the + * caller that it can return this index to userspace (return parameter) + * + * Prepare multiple futexes in a single step and enqueue them. This may fail if + * the futex list is invalid or if any futex was already awoken. On success the + * task is ready to interruptible sleep. + * + * Return: + * - 1 - One of the futexes was woken by another thread + * - 0 - Success + * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL + */ +static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) +{ + struct futex_hash_bucket *hb; + bool retry = false; + int ret, i; + u32 uval; + + /* + * Enqueuing multiple futexes is tricky, because we need to enqueue + * each futex on the list before dealing with the next one to avoid + * deadlocking on the hash bucket. But, before enqueuing, we need to + * make sure that current->state is TASK_INTERRUPTIBLE, so we don't + * lose any wake events, which cannot be done before the get_futex_key + * of the next key, because it calls get_user_pages, which can sleep. + * Thus, we fetch the list of futexes keys in two steps, by first + * pinning all the memory keys in the futex key, and only then we read + * each key and queue the corresponding futex. + * + * Private futexes doesn't need to recalculate hash in retry, so skip + * get_futex_key() when retrying. + */ +retry: + for (i = 0; i < count; i++) { + if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) + continue; + + ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), + !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), + &vs[i].q.key, FUTEX_READ); + + if (unlikely(ret)) + return ret; + } + + set_current_state(TASK_INTERRUPTIBLE); + + for (i = 0; i < count; i++) { + u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; + struct futex_q *q = &vs[i].q; + u32 val = (u32)vs[i].w.val; + + hb = futex_q_lock(q); + ret = futex_get_value_locked(&uval, uaddr); + + if (!ret && uval == val) { + /* + * The bucket lock can't be held while dealing with the + * next futex. Queue each futex at this moment so hb can + * be unlocked. + */ + futex_queue(q, hb); + continue; + } + + futex_q_unlock(hb); + __set_current_state(TASK_RUNNING); + + /* + * Even if something went wrong, if we find out that a futex + * was woken, we don't return error and return this index to + * userspace + */ + *woken = unqueue_multiple(vs, i); + if (*woken >= 0) + return 1; + + if (ret) { + /* + * If we need to handle a page fault, we need to do so + * without any lock and any enqueued futex (otherwise + * we could lose some wakeup). So we do it here, after + * undoing all the work done so far. In success, we + * retry all the work. + */ + if (get_user(uval, uaddr)) + return -EFAULT; + + retry = true; + goto retry; + } + + if (uval != val) + return -EWOULDBLOCK; + } + + return 0; +} + +/** + * futex_sleep_multiple - Check sleeping conditions and sleep + * @vs: List of futexes to wait for + * @count: Length of vs + * @to: Timeout + * + * Sleep if and only if the timeout hasn't expired and no futex on the list has + * been woken up. 
+ */ +static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to) +{ + if (to && !to->task) + return; + + for (; count; count--, vs++) { + if (!READ_ONCE(vs->q.lock_ptr)) + return; + } + + freezable_schedule(); +} + +/** + * futex_wait_multiple - Prepare to wait on and enqueue several futexes + * @vs: The list of futexes to wait on + * @count: The number of objects + * @to: Timeout before giving up and returning to userspace + * + * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function + * sleeps on a group of futexes and returns on the first futex that is + * wake, or after the timeout has elapsed. + * + * Return: + * - >=0 - Hint to the futex that was awoken + * - <0 - On error + */ +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to) +{ + int ret, hint = 0; + + if (to) + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); + + while (1) { + ret = futex_wait_multiple_setup(vs, count, &hint); + if (ret) { + if (ret > 0) { + /* A futex was woken during setup */ + ret = hint; + } + return ret; + } + + futex_sleep_multiple(vs, count, to); + + __set_current_state(TASK_RUNNING); + + ret = unqueue_multiple(vs, count); + if (ret >= 0) + return ret; + + if (to && !to->task) + return -ETIMEDOUT; + else if (signal_pending(current)) + return -ERESTARTSYS; + /* + * The final case is a spurious wakeup, for + * which just retry. + */ + } +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @flags: futex flags (FLAGS_SHARED, etc.) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held on success, and unlocked on failure. + * + * Return: + * - 0 - uaddr contains val and hb has been locked; + * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + */ +int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; + + /* + * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. 
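The waiter/waker pseudocode in the comment above corresponds to roughly the following minimal userspace sketch, assuming the raw futex(2) interface via syscall(SYS_futex, ...); spurious wakeups and value mismatches are handled with a simple retry loop.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

static _Atomic uint32_t flag;	/* the futex word, 0 == condition not yet true */

/* Waiter: block until another thread makes the condition true. */
static void wait_for_flag(void)
{
	uint32_t val;

	while ((val = atomic_load(&flag)) == 0) {
		/* Sleeps only if *uaddr still equals val, as futex_wait_setup() verifies. */
		if (syscall(SYS_futex, &flag, FUTEX_WAIT_PRIVATE, val, NULL) == -1 &&
		    errno != EAGAIN && errno != EINTR)
			break;
	}
}

/* Waker: publish the new value first, then wake a sleeper. */
static void set_flag(void)
{
	atomic_store(&flag, 1);
	syscall(SYS_futex, &flag, FUTEX_WAKE_PRIVATE, 1);
}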
+ */ +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = futex_q_lock(q); + + ret = futex_get_value_locked(&uval, uaddr); + + if (ret) { + futex_q_unlock(*hb); + + ret = get_user(uval, uaddr); + if (ret) + return ret; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } + + if (uval != val) { + futex_q_unlock(*hb); + ret = -EWOULDBLOCK; + } + + return ret; +} + +int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +{ + struct hrtimer_sleeper timeout, *to; + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int ret; + + if (!bitset) + return -EINVAL; + q.bitset = bitset; + + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); +retry: + /* + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out; + + /* futex_queue and wait for wakeup, timeout, or a signal. */ + futex_wait_queue(hb, &q, to); + + /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; + if (!futex_unqueue(&q)) + goto out; + ret = -ETIMEDOUT; + if (to && !to->task) + goto out; + + /* + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. + */ + if (!signal_pending(current)) + goto retry; + + ret = -ERESTARTSYS; + if (!abs_time) + goto out; + + restart = ¤t->restart_block; + restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; + + ret = set_restart_fn(restart, futex_wait_restart); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = restart->futex.uaddr; + ktime_t t, *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); +} + diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index fbc54c2a7f23..10929eda9825 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,9 +97,6 @@ config GENERIC_MSI_IRQ_DOMAIN config IRQ_MSI_IOMMU bool -config HANDLE_DOMAIN_IRQ - bool - config IRQ_TIMINGS bool @@ -144,3 +141,10 @@ config GENERIC_IRQ_MULTI_HANDLER bool help Allow to specify the low level IRQ handler at run time. + +# Cavium Octeon is the last system to use this deprecated option +# Do not even think of enabling this on any new platform +config DEPRECATED_IRQ_CPU_ONOFFLINE + bool + depends on CAVIUM_OCTEON_SOC + default CAVIUM_OCTEON_SOC diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a98bcfc4be7b..f895265d7548 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1122,6 +1122,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) } EXPORT_SYMBOL_GPL(irq_modify_status); +#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE /** * irq_cpu_online - Invoke all irq_cpu_online functions. 
* @@ -1181,6 +1182,7 @@ void irq_cpu_offline(void) raw_spin_unlock_irqrestore(&desc->lock, flags); } } +#endif #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index cc7cdd26e23e..6f29bf4c8515 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -25,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock); void irq_gc_noop(struct irq_data *d) { } +EXPORT_SYMBOL_GPL(irq_gc_noop); /** * irq_gc_mask_disable_reg - Mask chip via disable register @@ -44,6 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg); /** * irq_gc_mask_set_bit - Mask chip via setting bit in mask register @@ -103,6 +105,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) *ct->mask_cache |= mask; irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg); /** * irq_gc_ack_set_bit - Ack pending interrupt via setting bit diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 221d80c31e94..27182003b879 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -14,6 +14,8 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <asm/irq_regs.h> + #include <trace/events/irq.h> #include "internals.h" @@ -226,4 +228,20 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) handle_arch_irq = handle_irq; return 0; } + +/** + * generic_handle_arch_irq - root irq handler for architectures which do no + * entry accounting themselves + * @regs: Register file coming from the low-level handling code + */ +asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + + irq_enter(); + old_regs = set_irq_regs(regs); + handle_arch_irq(regs); + set_irq_regs(old_regs); + irq_exit(); +} #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4e3c29bb603c..2267e6527db3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -646,13 +646,16 @@ int handle_irq_desc(struct irq_desc *desc) generic_handle_irq_desc(desc); return 0; } -EXPORT_SYMBOL_GPL(handle_irq_desc); /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle * - */ + * Returns: 0 on success, or -EINVAL if conversion has failed + * + * This function must be called from an IRQ context with irq regs + * initialized. + */ int generic_handle_irq(unsigned int irq) { return handle_irq_desc(irq_to_desc(irq)); @@ -662,89 +665,39 @@ EXPORT_SYMBOL_GPL(generic_handle_irq); #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging - * to a domain, usually for a non-root interrupt - * controller + * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * + * This function must be called from an IRQ context with irq regs + * initialized. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { + WARN_ON_ONCE(!in_irq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); -#ifdef CONFIG_HANDLE_DOMAIN_IRQ /** - * handle_domain_irq - Invoke the handler for a HW irq belonging to a domain, - * usually for a root interrupt controller + * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging + * to a domain. 
* @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one - * @regs: Register file coming from the low-level handling code * * Returns: 0 on success, or -EINVAL if conversion has failed - */ -int handle_domain_irq(struct irq_domain *domain, - unsigned int hwirq, struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc *desc; - int ret = 0; - - irq_enter(); - - /* The irqdomain code provides boundary checks */ - desc = irq_resolve_mapping(domain, hwirq); - if (likely(desc)) - handle_irq_desc(desc); - else - ret = -EINVAL; - - irq_exit(); - set_irq_regs(old_regs); - return ret; -} - -/** - * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain - * @domain: The domain where to perform the lookup - * @hwirq: The HW irq number to convert to a logical one - * @regs: Register file coming from the low-level handling code * - * This function must be called from an NMI context. - * - * Returns: 0 on success, or -EINVAL if conversion has failed - */ -int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, - struct pt_regs *regs) + * This function must be called from an NMI context with irq regs + * initialized. + **/ +int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) { - struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc *desc; - int ret = 0; - - /* - * NMI context needs to be setup earlier in order to deal with tracing. - */ - WARN_ON(!in_nmi()); - - desc = irq_resolve_mapping(domain, hwirq); - - /* - * ack_bad_irq is not NMI-safe, just report - * an invalid interrupt. - */ - if (likely(desc)) - handle_irq_desc(desc); - else - ret = -EINVAL; - - set_irq_regs(old_regs); - return ret; + WARN_ON_ONCE(!in_nmi()); + return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } #endif -#endif /* Dynamic interrupt handling */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 27667e82ecc9..7405e384e5ed 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1259,6 +1259,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + sched_set_fifo(current); + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; @@ -1424,8 +1426,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) if (IS_ERR(t)) return PTR_ERR(t); - sched_set_fifo(t); - /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code @@ -2827,7 +2827,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * - * This function should be called with preemption disabled if the + * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. 
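With handle_domain_irq() removed above and the entry accounting moved into generic_handle_arch_irq(), drivers resolve domain interrupts with generic_handle_domain_irq(). A sketch of a chained (muxed) irqchip flow handler using it; the driver structure, register offset and 32-bit pending mask are illustrative assumptions rather than any real hardware.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

#define EXAMPLE_PENDING_REG	0x10	/* made-up register offset */

struct example_mux {
	void __iomem		*base;
	struct irq_domain	*domain;
};

static void example_mux_handle_irq(struct irq_desc *desc)
{
	struct example_mux *mux = irq_desc_get_handler_data(desc);
	struct irq_chip *chip = irq_desc_get_chip(desc);
	unsigned long pending;
	unsigned int hwirq;

	chained_irq_enter(chip, desc);

	pending = readl_relaxed(mux->base + EXAMPLE_PENDING_REG);
	for_each_set_bit(hwirq, &pending, 32)
		generic_handle_domain_irq(mux->domain, hwirq);

	chained_irq_exit(chip, desc);
}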
*/ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c481d8458325..02b2daf07441 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + pr_warn("irqfixup boot option not supported with PREEMPT_RT\n"); + return 1; + } irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + pr_warn("irqpoll boot option not supported with PREEMPT_RT\n"); + return 1; + } irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf1c00c881e4..7096384dc60f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -888,7 +888,7 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - hlist_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -4671,7 +4671,7 @@ print_lock_invalid_wait_context(struct task_struct *curr, /* * Verify the wait_type context. * - * This check validates we takes locks in the right wait-type order; that is it + * This check validates we take locks in the right wait-type order; that is it * ensures that we do not take mutexes inside spinlocks and do not attempt to * acquire spinlocks inside raw_spinlocks and the sort. * @@ -5366,7 +5366,7 @@ int __lock_is_held(const struct lockdep_map *lock, int read) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - if (read == -1 || hlock->read == read) + if (read == -1 || !!hlock->read == read) return LOCK_STATE_HELD; return LOCK_STATE_NOT_HELD; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d456579d0952..db1913611192 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -94,6 +94,9 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* + * Returns: __mutex_owner(lock) on failure or NULL on success. + */ static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff) { unsigned long owner, curr = (unsigned long)current; @@ -348,13 +351,16 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, { bool ret = true; - rcu_read_lock(); + lockdep_assert_preemption_disabled(); + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * checking lock->owner still matches owner. And we already + * disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. 
Thus the + * task_struct structure won't go away during the spinning + * period */ barrier(); @@ -374,7 +380,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, cpu_relax(); } - rcu_read_unlock(); return ret; } @@ -387,19 +392,25 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + lockdep_assert_preemption_disabled(); + if (need_resched()) return 0; - rcu_read_lock(); + /* + * We already disabled preemption which is equal to the RCU read-side + * critical section in optimistic spinning code. Thus the task_struct + * structure won't go away during the spinning period. + */ owner = __mutex_owner(lock); /* * As lock holder preemption issue, we both skip spinning if task is not * on cpu or its cpu is preempted */ + if (owner) retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); - rcu_read_unlock(); /* * If lock->owner is not set, the mutex has been released. Return true @@ -736,6 +747,44 @@ __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } +/** + * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context + * @ww: mutex to lock + * @ww_ctx: optional w/w acquire context + * + * Trylocks a mutex with the optional acquire context; no deadlock detection is + * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise. + * + * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is + * specified, -EALREADY handling may happen in calls to ww_mutex_trylock. + * + * A mutex acquired with this function must be released with ww_mutex_unlock. + */ +int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx) + return mutex_trylock(&ww->base); + + MUTEX_WARN_ON(ww->base.magic != &ww->base); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held.
+ */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__mutex_trylock(&ww->base)) { + ww_mutex_set_context_fastpath(ww, ww_ctx); + mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(ww_mutex_trylock); + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6bb116c559b4..0c6a48dfcecb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -446,19 +446,26 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) } /* RT mutex specific wake_q wrappers */ -static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, - struct rt_mutex_waiter *w) +static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh, + struct task_struct *task, + unsigned int wake_state) { - if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) { + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) { if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(wqh->rtlock_task); - get_task_struct(w->task); - wqh->rtlock_task = w->task; + get_task_struct(task); + wqh->rtlock_task = task; } else { - wake_q_add(&wqh->head, w->task); + wake_q_add(&wqh->head, task); } } +static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, + struct rt_mutex_waiter *w) +{ + rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state); +} + static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) { if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) { diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 88191f6e252c..6fd3162e4098 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -59,8 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) * set. */ for (r = atomic_read(&rwb->readers); r < 0;) { - /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */ - if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) + if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1))) return 1; } return 0; @@ -148,6 +147,7 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, { struct rt_mutex_base *rtm = &rwb->rtmutex; struct task_struct *owner; + DEFINE_RT_WAKE_Q(wqh); raw_spin_lock_irq(&rtm->wait_lock); /* @@ -158,9 +158,12 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, */ owner = rt_mutex_owner(rtm); if (owner) - wake_up_state(owner, state); + rt_mutex_wake_q_add_task(&wqh, owner, state); + /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */ + preempt_disable(); raw_spin_unlock_irq(&rtm->wait_lock); + rt_mutex_wake_up_q(&wqh); } static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, @@ -183,7 +186,7 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, /* * _release() is needed in case that reader is in fast path, pairing - * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE + * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock(). 
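The ww_mutex_trylock() added above (and, for PREEMPT_RT, further below in ww_rt_mutex.c) is meant to be used under an acquire context just like ww_mutex_lock(); a minimal sketch of the opportunistic pattern follows, where the lock class, the two objects and their initialization are assumed rather than taken from real code.

#include <linux/ww_mutex.h>

static DEFINE_WD_CLASS(example_ww_class);
/* Both assumed to have been set up with ww_mutex_init(..., &example_ww_class). */
static struct ww_mutex obj_a, obj_b;

static bool try_lock_both(struct ww_acquire_ctx *ctx)
{
	ww_acquire_init(ctx, &example_ww_class);

	if (ww_mutex_lock(&obj_a, ctx))
		goto fail;

	/*
	 * Opportunistically grab the second object under the same context:
	 * unlike ww_mutex_lock() this never sleeps and never returns
	 * -EDEADLK, it simply fails (returns 0) when obj_b is contended.
	 */
	if (!ww_mutex_trylock(&obj_b, ctx)) {
		ww_mutex_unlock(&obj_a);
		goto fail;
	}

	/* Caller unlocks both objects and calls ww_acquire_fini() when done. */
	ww_acquire_done(ctx);
	return true;

fail:
	ww_acquire_fini(ctx);
	return false;
}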
*/ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 000e8d5a2884..c51387a43265 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -56,7 +56,6 @@ * * A fast path reader optimistic lock stealing is supported when the rwsem * is previously owned by a writer and the following conditions are met: - * - OSQ is empty * - rwsem is not currently writer owned * - the handoff isn't set. */ @@ -485,7 +484,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, /* * Limit # of readers that can be woken up per wakeup call. */ - if (woken >= MAX_READERS_WAKEUP) + if (unlikely(woken >= MAX_READERS_WAKEUP)) break; } @@ -577,6 +576,24 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, return true; } +/* + * The rwsem_spin_on_owner() function returns the following 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. + */ +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * Try to acquire write lock before the writer has been put on wait queue. @@ -617,7 +634,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) } preempt_disable(); - rcu_read_lock(); + /* + * Disable preemption is equal to the RCU read-side crital section, + * thus the task_strcut structure won't go away. + */ owner = rwsem_owner_flags(sem, &flags); /* * Don't check the read-owner as the entry may be stale. @@ -625,30 +645,12 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) if ((flags & RWSEM_NONSPINNABLE) || (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; - rcu_read_unlock(); preempt_enable(); lockevent_cond_inc(rwsem_opt_fail, !ret); return ret; } -/* - * The rwsem_spin_on_owner() function returns the following 4 values - * depending on the lock owner state. - * OWNER_NULL : owner is currently NULL - * OWNER_WRITER: when owner changes and is a writer - * OWNER_READER: when owner changes and the new owner may be a reader. - * OWNER_NONSPINNABLE: - * when optimistic spinning has to stop because either the - * owner stops running, is unknown, or its timeslice has - * been used up. 
- */ -enum owner_state { - OWNER_NULL = 1 << 0, - OWNER_WRITER = 1 << 1, - OWNER_READER = 1 << 2, - OWNER_NONSPINNABLE = 1 << 3, -}; #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state @@ -670,12 +672,13 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) unsigned long flags, new_flags; enum owner_state state; + lockdep_assert_preemption_disabled(); + owner = rwsem_owner_flags(sem, &flags); state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; - rcu_read_lock(); for (;;) { /* * When a waiting writer set the handoff flag, it may spin @@ -693,7 +696,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * our spinning context already disabled preemption which is + * equal to RCU read-side crital section ensures the memory + * stays valid. */ barrier(); @@ -704,7 +709,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) cpu_relax(); } - rcu_read_unlock(); return state; } @@ -878,12 +882,11 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) static inline void clear_nonspinnable(struct rw_semaphore *sem) { } -static inline int +static inline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) { - return 0; + return OWNER_NONSPINNABLE; } -#define OWNER_NULL 1 #endif /* @@ -1095,9 +1098,16 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF && - rwsem_spin_on_owner(sem) == OWNER_NULL) - goto trylock_again; + if (wstate == WRITER_HANDOFF) { + enum owner_state owner_state; + + preempt_disable(); + owner_state = rwsem_spin_on_owner(sem); + preempt_enable(); + + if (owner_state == OWNER_NULL) + goto trylock_again; + } /* Block until there are no active lockers. */ for (;;) { diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index c5830cfa379a..b562f9289372 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -378,8 +378,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); return flags; } EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index d2912e44d61f..b2e553f9255b 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -24,6 +24,17 @@ #define RT_MUTEX_BUILD_SPINLOCKS #include "rtmutex.c" +/* + * __might_resched() skips the state check as rtlocks are state + * preserving. Take RCU nesting into account as spin/read/write_lock() can + * legitimately nest into an RCU read side critical section. 
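As a concrete instance of the nesting described above (the data structure and function are made up): on PREEMPT_RT the spinlock below is a sleeping rtmutex, yet taking it inside an RCU read-side section must not trigger the might-resched check, which is why the offsets defined next fold rcu_preempt_depth() into the expected count.

#include <linux/spinlock.h>
#include <linux/rcupdate.h>

struct example_item {
	int count;
};

static DEFINE_SPINLOCK(example_lock);

static void example_bump(struct example_item *item)
{
	rcu_read_lock();
	spin_lock(&example_lock);	/* may block on PREEMPT_RT, still valid here */
	item->count++;
	spin_unlock(&example_lock);
	rcu_read_unlock();
}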
+ */ +#define RTLOCK_RESCHED_OFFSETS \ + (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) + +#define rtlock_might_resched() \ + __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) + static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) @@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) static __always_inline void __rt_spin_lock(spinlock_t *lock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rtlock_lock(&lock->lock); rcu_read_lock(); migrate_disable(); @@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock); void __sched rt_read_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock); void __sched rt_write_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 3e82f449b4ff..353004155d65 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -16,6 +16,15 @@ static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH +#define ww_acquire_init_noinject(a, b) do { \ + ww_acquire_init((a), (b)); \ + (a)->deadlock_inject_countdown = ~0U; \ + } while (0) +#else +#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b)) +#endif + struct test_mutex { struct work_struct work; struct ww_mutex mutex; @@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work) wait_for_completion(&mtx->go); if (mtx->flags & TEST_MTX_TRY) { - while (!ww_mutex_trylock(&mtx->mutex)) + while (!ww_mutex_trylock(&mtx->mutex, NULL)) cond_resched(); } else { ww_mutex_lock(&mtx->mutex, NULL); @@ -109,19 +118,39 @@ static int test_mutex(void) return 0; } -static int test_aa(void) +static int test_aa(bool trylock) { struct ww_mutex mutex; struct ww_acquire_ctx ctx; int ret; + const char *from = trylock ? 
"trylock" : "lock"; ww_mutex_init(&mutex, &ww_class); ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&mutex, &ctx); + if (!trylock) { + ret = ww_mutex_lock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial lock failed!\n", __func__); + goto out; + } + } else { + ret = !ww_mutex_trylock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial trylock failed!\n", __func__); + goto out; + } + } - if (ww_mutex_trylock(&mutex)) { - pr_err("%s: trylocked itself!\n", __func__); + if (ww_mutex_trylock(&mutex, NULL)) { + pr_err("%s: trylocked itself without context from %s!\n", __func__, from); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + if (ww_mutex_trylock(&mutex, &ctx)) { + pr_err("%s: trylocked itself with context from %s!\n", __func__, from); ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; @@ -129,17 +158,17 @@ static int test_aa(void) ret = ww_mutex_lock(&mutex, &ctx); if (ret != -EALREADY) { - pr_err("%s: missed deadlock for recursing, ret=%d\n", - __func__, ret); + pr_err("%s: missed deadlock for recursing, ret=%d from %s\n", + __func__, ret, from); if (!ret) ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; } + ww_mutex_unlock(&mutex); ret = 0; out: - ww_mutex_unlock(&mutex); ww_acquire_fini(&ctx); return ret; } @@ -150,7 +179,7 @@ struct test_abba { struct ww_mutex b_mutex; struct completion a_ready; struct completion b_ready; - bool resolve; + bool resolve, trylock; int result; }; @@ -160,8 +189,13 @@ static void test_abba_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err; - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba->b_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!abba->trylock) + ww_mutex_lock(&abba->b_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx); complete(&abba->b_ready); wait_for_completion(&abba->a_ready); @@ -181,7 +215,7 @@ static void test_abba_work(struct work_struct *work) abba->result = err; } -static int test_abba(bool resolve) +static int test_abba(bool trylock, bool resolve) { struct test_abba abba; struct ww_acquire_ctx ctx; @@ -192,12 +226,18 @@ static int test_abba(bool resolve) INIT_WORK_ONSTACK(&abba.work, test_abba_work); init_completion(&abba.a_ready); init_completion(&abba.b_ready); + abba.trylock = trylock; abba.resolve = resolve; schedule_work(&abba.work); - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba.a_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!trylock) + ww_mutex_lock(&abba.a_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx); complete(&abba.a_ready); wait_for_completion(&abba.b_ready); @@ -249,7 +289,7 @@ static void test_cycle_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err, erra = 0; - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); complete(cycle->a_signal); @@ -581,7 +621,9 @@ static int stress(int nlocks, int nthreads, unsigned int flags) static int __init test_ww_mutex_init(void) { int ncpus = num_online_cpus(); - int ret; + int ret, i; + + printk(KERN_INFO "Beginning ww mutex selftests\n"); wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) @@ -591,17 +633,19 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = test_aa(); + ret = test_aa(false); if (ret) return ret; - ret = test_abba(false); + ret = test_aa(true); if (ret) return ret; - ret = test_abba(true); - if (ret) - return ret; + for 
(i = 0; i < 4; i++) { + ret = test_abba(i & 1, i & 2); + if (ret) + return ret; + } ret = test_cycle(ncpus); if (ret) @@ -619,6 +663,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + printk(KERN_INFO "All ww mutex selftests passed\n"); return 0; } diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index 3f1fff7d2780..0e00205cf467 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -9,6 +9,31 @@ #define WW_RT #include "rtmutex.c" +int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct rt_mutex *rtm = &lock->base; + + if (!ww_ctx) + return rt_mutex_trylock(rtm); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__rt_mutex_trylock(&rtm->rtmutex)) { + ww_mutex_set_context_fastpath(lock, ww_ctx); + mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(ww_mutex_trylock); + static int __sched __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, unsigned long ip) diff --git a/kernel/module.c b/kernel/module.c index 40ec9a030eec..5c26a76e800b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4489,8 +4489,10 @@ static void cfi_init(struct module *mod) /* Fix init/exit functions to point to the CFI jump table */ if (init) mod->init = *init; +#ifdef CONFIG_MODULE_UNLOAD if (exit) mod->exit = *exit; +#endif cfi_module_add(mod, module_addr_min); #endif diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c21b38cc25e9..690b0cec7459 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -247,7 +247,7 @@ struct lockdep_map rcu_lock_map = { .name = "rcu_read_lock", .key = &rcu_lock_key, .wait_type_outer = LD_WAIT_FREE, - .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */ + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */ }; EXPORT_SYMBOL_GPL(rcu_lock_map); @@ -256,7 +256,7 @@ struct lockdep_map rcu_bh_lock_map = { .name = "rcu_read_lock_bh", .key = &rcu_bh_lock_key, .wait_type_outer = LD_WAIT_FREE, - .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */ + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ }; EXPORT_SYMBOL_GPL(rcu_bh_lock_map); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f2611b9cf503..523fd602ea90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -13,7 +13,7 @@ #include "sched.h" #include <linux/nospec.h> - +#include <linux/blkdev.h> #include <linux/kcov.h> #include <linux/scs.h> @@ -6300,7 +6300,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * make sure to submit it to avoid deadlocks. 
*/ if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -8332,7 +8332,8 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - blk_schedule_flush_plug(current); + if (current->plug) + blk_flush_plug(current->plug, true); return old_iowait; } @@ -8773,6 +8774,7 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } + scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } @@ -9448,14 +9450,8 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - - return (nested == preempt_offset); -} -void __might_sleep(const char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line) { unsigned int state = get_current_state(); /* @@ -9469,11 +9465,32 @@ void __might_sleep(const char *file, int line, int preempt_offset) (void *)current->task_state_change, (void *)current->task_state_change); - ___might_sleep(file, line, preempt_offset); + __might_resched(file, line, 0); } EXPORT_SYMBOL(__might_sleep); -void ___might_sleep(const char *file, int line, int preempt_offset) +static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) + return; + + if (preempt_count() == preempt_offset) + return; + + pr_err("Preemption disabled at:"); + print_ip_sym(KERN_ERR, ip); +} + +static inline bool resched_offsets_ok(unsigned int offsets) +{ + unsigned int nested = preempt_count(); + + nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; + + return nested == offsets; +} + +void __might_resched(const char *file, int line, unsigned int offsets) { /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; @@ -9483,7 +9500,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* WARN_ON_ONCE() by default, no rate limit required: */ rcu_sleep_check(); - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + if ((resched_offsets_ok(offsets) && !irqs_disabled() && !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) @@ -9496,29 +9513,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->non_block_count, - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, + current->pid, current->comm); + pr_err("preempt_count: %x, expected: %x\n", preempt_count(), + offsets & MIGHT_RESCHED_PREEMPT_MASK); + + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + pr_err("RCU nest depth: %d, expected: %u\n", + rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); + } if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + pr_emerg("Thread overran stack, or stack corrupted\n"); 
debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && !preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(KERN_ERR, preempt_disable_ip); - } + + print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, + preempt_disable_ip); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -EXPORT_SYMBOL(___might_sleep); +EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_offset) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f0b249ec581d..56ff343d4119 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -37,7 +37,6 @@ #include <linux/binfmts.h> #include <linux/bitops.h> -#include <linux/blkdev.h> #include <linux/compat.h> #include <linux/context_tracking.h> #include <linux/cpufreq.h> diff --git a/kernel/signal.c b/kernel/signal.c index 952741f6d0f9..487bf4f5dadf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -426,22 +426,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, */ rcu_read_lock(); ucounts = task_ucounts(t); - sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); - switch (sigpending) { - case 1: - if (likely(get_ucounts(ucounts))) - break; - fallthrough; - case LONG_MAX: - /* - * we need to decrease the ucount in the userns tree on any - * failure to avoid counts leaking. - */ - dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); - rcu_read_unlock(); - return NULL; - } + sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); + if (!sigpending) + return NULL; if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); @@ -450,8 +438,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, } if (unlikely(q == NULL)) { - if (dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) - put_ucounts(ucounts); + dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); } else { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; @@ -464,8 +451,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { - put_ucounts(q->ucounts); + if (q->ucounts) { + dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f43d89d92860..d1944258cfc0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -143,13 +143,14 @@ COND_SYSCALL(capset); /* __ARCH_WANT_SYS_CLONE3 */ COND_SYSCALL(clone3); -/* kernel/futex.c */ +/* kernel/futex/syscalls.c */ COND_SYSCALL(futex); COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); +COND_SYSCALL(futex_waitv); /* kernel/hrtimer.c */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fa91f398f28b..1183c88634aa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -816,7 +816,7 @@ blk_trace_request_get_cgid(struct request *rq) * Records an action against a request. Will log the bio offset + size. 
* **/ -static void blk_add_trace_rq(struct request *rq, int error, +static void blk_add_trace_rq(struct request *rq, blk_status_t error, unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt; @@ -834,7 +834,8 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL, cgid); + rq->cmd_flags, what, blk_status_to_errno(error), 0, + NULL, cgid); rcu_read_unlock(); } @@ -863,7 +864,7 @@ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, - int error, unsigned int nr_bytes) + blk_status_t error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, blk_trace_request_get_cgid(rq)); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7efbc8aaf7f6..feebf57c6458 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2208,7 +2208,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) } /** - * ftrace_update_record, set a record that now is tracing or not + * ftrace_update_record - set a record that now is tracing or not * @rec: the record to update * @enable: set to true if the record is tracing, false to force disable * @@ -2221,7 +2221,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable) } /** - * ftrace_test_record, check if the record has been enabled or not + * ftrace_test_record - check if the record has been enabled or not * @rec: the record to test * @enable: set to true to check if enabled, false if it is disabled * @@ -2574,7 +2574,7 @@ struct ftrace_rec_iter { }; /** - * ftrace_rec_iter_start, start up iterating over traced functions + * ftrace_rec_iter_start - start up iterating over traced functions * * Returns an iterator handle that is used to iterate over all * the records that represent address locations where functions @@ -2605,7 +2605,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void) } /** - * ftrace_rec_iter_next, get the next record to process. + * ftrace_rec_iter_next - get the next record to process. * @iter: The handle to the iterator. * * Returns the next iterator after the given iterator @iter. @@ -2630,7 +2630,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) } /** - * ftrace_rec_iter_record, get the record at the iterator location + * ftrace_rec_iter_record - get the record at the iterator location * @iter: The current iterator location * * Returns the record that the current @iter is at. 
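The kerneldoc fixes above touch the ftrace record iterator, whose intended use is a simple loop over every patchable call site; a sketch of that loop as arch code would typically run it during code modification, under ftrace's own locking (the reporting body is a placeholder).

#include <linux/ftrace.h>
#include <linux/printk.h>

/* Walk every dyn_ftrace record known to ftrace and report its address. */
static void example_walk_ftrace_records(void)
{
	struct ftrace_rec_iter *iter;
	struct dyn_ftrace *rec;

	for (iter = ftrace_rec_iter_start(); iter; iter = ftrace_rec_iter_next(iter)) {
		rec = ftrace_rec_iter_record(iter);
		pr_debug("patchable site at %pS\n", (void *)rec->ip);
	}
}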
@@ -2733,7 +2733,7 @@ static int __ftrace_modify_code(void *data) } /** - * ftrace_run_stop_machine, go back to the stop machine method + * ftrace_run_stop_machine - go back to the stop machine method * @command: The command to tell ftrace what to do * * If an arch needs to fall back to the stop machine method, the @@ -2745,7 +2745,7 @@ void ftrace_run_stop_machine(int command) } /** - * arch_ftrace_update_code, modify the code to trace or not trace + * arch_ftrace_update_code - modify the code to trace or not trace * @command: The command that needs to be done * * Archs can override this function if it does not need to @@ -6977,7 +6977,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START); if (bit < 0) return; @@ -7052,7 +7052,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START); if (bit < 0) return; @@ -7525,7 +7525,9 @@ void ftrace_kill(void) } /** - * Test if ftrace is dead or not. + * ftrace_is_dead - Test if ftrace is dead or not. + * + * Returns 1 if ftrace is "dead", zero otherwise. */ int ftrace_is_dead(void) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7896d30d90f7..bc677cd64224 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1744,16 +1744,15 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -/* - * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - * defined(CONFIG_FSNOTIFY) - */ -#else +#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER) #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", 0644, d_tracer, \ &tr->max_latency, &tracing_max_lat_fops) +#else +#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif #ifdef CONFIG_TRACER_MAX_TRACE @@ -9473,9 +9472,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_maxlat_file(tr, d_tracer); -#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 3044b762cbd7..928867f527e7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -119,10 +119,58 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_eprobe *ep = to_trace_eprobe(ev); + const char *slash; - return strcmp(trace_probe_name(&ep->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && - trace_probe_match_command_args(&ep->tp, argc, argv); + /* + * We match the following: + * event only - match all eprobes with event name + * system and event only - match all system/event probes + * + * The below has the above satisfied with more arguments: + * + * attached system/event - If the arg has the system and event + * the probe is attached to, match + * probes with the attachment. + * + * If any more args are given, then it requires a full match. 
+ */ + + /* + * If system exists, but this probe is not part of that system + * do not match. + */ + if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0) + return false; + + /* Must match the event name */ + if (strcmp(trace_probe_name(&ep->tp), event) != 0) + return false; + + /* No arguments match all */ + if (argc < 1) + return true; + + /* First argument is the system/event the probe is attached to */ + + slash = strchr(argv[0], '/'); + if (!slash) + slash = strchr(argv[0], '.'); + if (!slash) + return false; + + if (strncmp(ep->event_system, argv[0], slash - argv[0])) + return false; + if (strcmp(ep->event_name, slash + 1)) + return false; + + argc--; + argv++; + + /* If there are no other args, then match */ + if (argc < 1) + return true; + + return trace_probe_match_command_args(&ep->tp, argc, argv); } static struct dyn_event_operations eprobe_dyn_event_ops = { @@ -632,6 +680,13 @@ static int disable_eprobe(struct trace_eprobe *ep, trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); + + /* Make sure nothing is using the edata or trigger */ + tracepoint_synchronize_unregister(); + + kfree(edata); + kfree(trigger); + return 0; } @@ -849,8 +904,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (IS_ERR(ep)) { ret = PTR_ERR(ep); - /* This must return -ENOMEM, else there is a bug */ - WARN_ON_ONCE(ret != -ENOMEM); + /* This must return -ENOMEM or missing event, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); ep = NULL; goto error; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a6061a69aa84..f01e442716e2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2506,7 +2506,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data, * events. However, for convenience, users are allowed to directly * specify an event field in an action, which will be automatically * converted into a variable on their behalf. - + * * If a user specifies a field on an event that isn't the event the * histogram currently being defined (the target event histogram), the * only way that can be accomplished is if a new hist trigger is diff --git a/kernel/ucount.c b/kernel/ucount.c index bb51849e6375..eb03f3c68375 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -284,6 +284,55 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) return (new == 0); } +static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, + struct ucounts *last, enum ucount_type type) +{ + struct ucounts *iter, *next; + for (iter = ucounts; iter != last; iter = next) { + long dec = atomic_long_add_return(-1, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + next = iter->ns->ucounts; + if (dec == 0) + put_ucounts(iter); + } +} + +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type) +{ + do_dec_rlimit_put_ucounts(ucounts, NULL, type); +} + +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) +{ + /* Caller must hold a reference to ucounts */ + struct ucounts *iter; + long dec, ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(1, &iter->ucount[type]); + if (new < 0 || new > max) + goto unwind; + if (iter == ucounts) + ret = new; + /* + * Grab an extra ucount reference for the caller when + * the rlimit count was previously 0. 
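 *
 * (Editor's sketch, an assumption about intended usage rather than part
 * of this patch.)  A caller that already holds a ucounts reference, as
 * required above, pairs the two helpers roughly like:
 *
 *	if (!inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING))
 *		return -EAGAIN;	/* over a limit somewhere up the chain */
 *	...
 *	dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);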
+ */ + if (new != 1) + continue; + if (!get_ucounts(iter)) + goto dec_unwind; + } + return ret; +dec_unwind: + dec = atomic_long_add_return(-1, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); +unwind: + do_dec_rlimit_put_ucounts(ucounts, iter, type); + return 0; +} + bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) { struct ucounts *iter; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a6b4a2443d..1b3eb1e9531f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4830,8 +4830,16 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); show_pwq(pwq); + printk_deferred_exit(); + } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. @@ -4849,7 +4857,12 @@ void show_workqueue_state(void) raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; - + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%us workers=%d", @@ -4864,6 +4877,7 @@ void show_workqueue_state(void) first = false; } pr_cont("\n"); + printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2a9b6dcdac4f..40e4766bc541 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -458,7 +458,7 @@ config STACK_VALIDATION config VMLINUX_VALIDATION bool - depends on STACK_VALIDATION && DEBUG_ENTRY && !PARAVIRT + depends on STACK_VALIDATION && DEBUG_ENTRY default y config VMLINUX_MAP diff --git a/lib/Makefile b/lib/Makefile index 5efd1b435a37..a841be5244ac 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -351,7 +351,7 @@ obj-$(CONFIG_OBJAGG) += objagg.o obj-$(CONFIG_PLDMFW) += pldmfw/ # KUnit tests -CFLAGS_bitfield_kunit.o := $(call cc-option,-Wframe-larger-than=10240) +CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index a2f38e23004a..9f4262ee33a5 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -20,8 +20,8 @@ * * The worst case for in-place decompression is that the beginning of * the file is compressed extremely well, and the rest of the file is - * uncompressible. Thus, we must look for worst-case expansion when the - * compressor is encoding uncompressible data. + * incompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding incompressible data. * * The structure of the .xz file in case of a compressed kernel is as follows. * Sizes (as bytes) of the fields are in parenthesis. @@ -58,7 +58,7 @@ * uncompressed size of the payload is in practice never less than the * payload size itself. The LZMA2 format would allow uncompressed size * to be less than the payload size, but no sane compressor creates such - * files. LZMA2 supports storing uncompressible data in uncompressed form, + * files. 
LZMA2 supports storing incompressible data in uncompressed form, * so there's never a need to create payloads whose uncompressed size is * smaller than the compressed size. * @@ -167,8 +167,8 @@ * memeq and memzero are not used much and any remotely sane implementation * is fast enough. memcpy/memmove speed matters in multi-call mode, but * the kernel image is decompressed in single-call mode, in which only - * memcpy speed can matter and only if there is a lot of uncompressible data - * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the + * memmove speed can matter and only if there is a lot of incompressible data + * (LZMA2 stores incompressible chunks in uncompressed form). Thus, the * functions below should just be kept small; it's probably not worth * optimizing for speed. */ diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 451543937524..53e7eb1dd76c 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -217,11 +217,12 @@ static void fprop_reflect_period_percpu(struct fprop_global *p, } /* Event of type pl happened */ -void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) +void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, + long nr) { fprop_reflect_period_percpu(p, pl); - percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&p->events, 1); + percpu_counter_add_batch(&pl->events, nr, PROP_BATCH); + percpu_counter_add(&p->events, nr); } void fprop_fraction_percpu(struct fprop_global *p, @@ -253,20 +254,29 @@ void fprop_fraction_percpu(struct fprop_global *p, } /* - * Like __fprop_inc_percpu() except that event is counted only if the given + * Like __fprop_add_percpu() except that event is counted only if the given * type has fraction smaller than @max_frac/FPROP_FRAC_BASE */ -void __fprop_inc_percpu_max(struct fprop_global *p, - struct fprop_local_percpu *pl, int max_frac) +void __fprop_add_percpu_max(struct fprop_global *p, + struct fprop_local_percpu *pl, int max_frac, long nr) { if (unlikely(max_frac < FPROP_FRAC_BASE)) { unsigned long numerator, denominator; + s64 tmp; fprop_fraction_percpu(p, pl, &numerator, &denominator); - if (numerator > - (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT) + /* Adding 'nr' to fraction exceeds max_frac/FPROP_FRAC_BASE? */ + tmp = (u64)denominator * max_frac - + ((u64)numerator << FPROP_FRAC_SHIFT); + if (tmp < 0) { + /* Maximum fraction already exceeded? */ return; + } else if (tmp < nr * (FPROP_FRAC_BASE - max_frac)) { + /* Add just enough for the fraction to saturate */ + nr = div_u64(tmp + FPROP_FRAC_BASE - max_frac - 1, + FPROP_FRAC_BASE - max_frac); + } } - __fprop_inc_percpu(p, pl); + __fprop_add_percpu(p, pl, nr); } diff --git a/lib/kunit/executor_test.c b/lib/kunit/executor_test.c index cdbe54b16501..e14a18af573d 100644 --- a/lib/kunit/executor_test.c +++ b/lib/kunit/executor_test.c @@ -116,8 +116,8 @@ static void kfree_at_end(struct kunit *test, const void *to_free) /* kfree() handles NULL already, but avoid allocating a no-op cleanup. 
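 * (Editor's note, an assumption about the KUnit resource API: unlike
 * kunit_alloc_and_get_resource(), kunit_alloc_resource() registers the
 * cleanup action without handing back an extra reference to the
 * struct kunit_resource, so there is nothing left for the test to put.)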
*/ if (IS_ERR_OR_NULL(to_free)) return; - kunit_alloc_and_get_resource(test, NULL, kfree_res_free, GFP_KERNEL, - (void *)to_free); + kunit_alloc_resource(test, NULL, kfree_res_free, GFP_KERNEL, + (void *)to_free); } static struct kunit_suite *alloc_fake_suite(struct kunit *test, diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 161108e5d2fe..71652e1c397c 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -258,7 +258,7 @@ static void init_shared_classes(void) #define WWAF(x) ww_acquire_fini(x) #define WWL(x, c) ww_mutex_lock(x, c) -#define WWT(x) ww_mutex_trylock(x) +#define WWT(x) ww_mutex_trylock(x, NULL) #define WWL1(x) ww_mutex_lock(x, NULL) #define WWU(x) ww_mutex_unlock(x) diff --git a/lib/random32.c b/lib/random32.c index 4d0e05e471d7..a57a0e18819d 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -39,6 +39,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/bitops.h> +#include <linux/slab.h> #include <asm/unaligned.h> #include <trace/events/random.h> diff --git a/lib/sbitmap.c b/lib/sbitmap.c index b25db9be938a..2709ab825499 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -489,6 +489,57 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq) } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); +unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, + unsigned int *offset) +{ + struct sbitmap *sb = &sbq->sb; + unsigned int hint, depth; + unsigned long index, nr; + int i; + + if (unlikely(sb->round_robin)) + return 0; + + depth = READ_ONCE(sb->depth); + hint = update_alloc_hint_before_get(sb, depth); + + index = SB_NR_TO_INDEX(sb, hint); + + for (i = 0; i < sb->map_nr; i++) { + struct sbitmap_word *map = &sb->map[index]; + unsigned long get_mask; + + sbitmap_deferred_clear(map); + if (map->word == (1UL << (map->depth - 1)) - 1) + continue; + + nr = find_first_zero_bit(&map->word, map->depth); + if (nr + nr_tags <= map->depth) { + atomic_long_t *ptr = (atomic_long_t *) &map->word; + int map_tags = min_t(int, nr_tags, map->depth); + unsigned long val, ret; + + get_mask = ((1UL << map_tags) - 1) << nr; + do { + val = READ_ONCE(map->word); + ret = atomic_long_cmpxchg(ptr, val, get_mask | val); + } while (ret != val); + get_mask = (get_mask & ~ret) >> nr; + if (get_mask) { + *offset = nr + (index << sb->shift); + update_alloc_hint_after_get(sb, depth, hint, + *offset + map_tags - 1); + return get_mask; + } + } + /* Jump to next index. 
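 * (Editor's worked example for the batch grab above, with assumed values:
 * if find_first_zero_bit() returns nr = 3 and nr_tags = 4, get_mask =
 * 0b1111 << 3 covers bits 3..6 of this word; after the cmpxchg only the
 * bits that were actually free survive "get_mask & ~ret", and shifting
 * back down by nr yields the tag mask relative to *offset.)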
*/ + if (++index >= sb->map_nr) + index = 0; + } + + return 0; +} + int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth) { @@ -577,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq) } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); +static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag) +{ + if (likely(!sb->round_robin && tag < sb->depth)) + data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag); +} + +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, + int *tags, int nr_tags) +{ + struct sbitmap *sb = &sbq->sb; + unsigned long *addr = NULL; + unsigned long mask = 0; + int i; + + smp_mb__before_atomic(); + for (i = 0; i < nr_tags; i++) { + const int tag = tags[i] - offset; + unsigned long *this_addr; + + /* since we're clearing a batch, skip the deferred map */ + this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; + if (!addr) { + addr = this_addr; + } else if (addr != this_addr) { + atomic_long_andnot(mask, (atomic_long_t *) addr); + mask = 0; + addr = this_addr; + } + mask |= (1UL << SB_NR_TO_BIT(sb, tag)); + } + + if (mask) + atomic_long_andnot(mask, (atomic_long_t *) addr); + + smp_mb__after_atomic(); + sbitmap_queue_wake_up(sbq); + sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), + tags[nr_tags - 1] - offset); +} + void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { @@ -601,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, */ smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq); - - if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth)) - *per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr; + sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 5cb50245a878..adce22ac18d6 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -39,6 +39,19 @@ config XZ_DEC_SPARC default y select XZ_DEC_BCJ +config XZ_DEC_MICROLZMA + bool "MicroLZMA decoder" + default n + help + MicroLZMA is a header format variant where the first byte + of a raw LZMA stream (without the end of stream marker) has + been replaced with a bitwise-negation of the lc/lp/pb + properties byte. MicroLZMA was created to be used in EROFS + but can be used by other things too where wasting minimal + amount of space for headers is important. + + Unless you know that you need this, say N. + endif config XZ_DEC_BCJ diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 7a6781e3f47b..27ce34520e78 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -248,6 +248,10 @@ struct lzma2_dec { * before the first LZMA chunk. */ bool need_props; + +#ifdef XZ_DEC_MICROLZMA + bool pedantic_microlzma; +#endif }; struct xz_dec_lzma2 { @@ -387,7 +391,14 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, *left -= copy_size; - memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + /* + * If doing in-place decompression in single-call mode and the + * uncompressed size of the file is larger than the caller + * thought (i.e. it is invalid input!), the buffers below may + * overlap and cause undefined behavior with memcpy(). + * With valid inputs memcpy() would be fine here. 
+ */ + memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size); dict->pos += copy_size; if (dict->full < dict->pos) @@ -397,7 +408,11 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, if (dict->pos == dict->end) dict->pos = 0; - memcpy(b->out + b->out_pos, b->in + b->in_pos, + /* + * Like above but for multi-call mode: use memmove() + * to avoid undefined behavior with invalid input. + */ + memmove(b->out + b->out_pos, b->in + b->in_pos, copy_size); } @@ -408,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, } } +#ifdef XZ_DEC_MICROLZMA +# define DICT_FLUSH_SUPPORTS_SKIPPING true +#else +# define DICT_FLUSH_SUPPORTS_SKIPPING false +#endif + /* * Flush pending data from dictionary to b->out. It is assumed that there is * enough space in b->out. This is guaranteed because caller uses dict_limit() @@ -421,8 +442,19 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) if (dict->pos == dict->end) dict->pos = 0; - memcpy(b->out + b->out_pos, dict->buf + dict->start, - copy_size); + /* + * These buffers cannot overlap even if doing in-place + * decompression because in multi-call mode dict->buf + * has been allocated by us in this file; it's not + * provided by the caller like in single-call mode. + * + * With MicroLZMA, b->out can be NULL to skip bytes that + * the caller doesn't need. This cannot be done with XZ + * because it would break BCJ filters. + */ + if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL) + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); } dict->start = dict->pos; @@ -488,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc) * functions so that the compiler is supposed to be able to more easily avoid * an extra branch. In this particular version of the LZMA decoder, this * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 - * on x86). Using a non-splitted version results in nicer looking code too. + * on x86). Using a non-split version results in nicer looking code too. * * NOTE: This must return an int. Do not make it return a bool or the speed * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, @@ -774,6 +806,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s) s->lzma.rep1 = 0; s->lzma.rep2 = 0; s->lzma.rep3 = 0; + s->lzma.len = 0; /* * All probabilities are initialized to the same value. This hack @@ -1157,8 +1190,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) } } - s->lzma.len = 0; - s->lzma2.sequence = SEQ_CONTROL; s->lzma2.need_dict_reset = true; @@ -1174,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) kfree(s); } + +#ifdef XZ_DEC_MICROLZMA +/* This is a wrapper struct to have a nice struct name in the public API. */ +struct xz_dec_microlzma { + struct xz_dec_lzma2 s; +}; + +enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr, + struct xz_buf *b) +{ + struct xz_dec_lzma2 *s = &s_ptr->s; + + /* + * sequence is SEQ_PROPERTIES before the first input byte, + * SEQ_LZMA_PREPARE until a total of five bytes have been read, + * and SEQ_LZMA_RUN for the rest of the input stream. + */ + if (s->lzma2.sequence != SEQ_LZMA_RUN) { + if (s->lzma2.sequence == SEQ_PROPERTIES) { + /* One byte is needed for the props. */ + if (b->in_pos >= b->in_size) + return XZ_OK; + + /* + * Don't increment b->in_pos here. The same byte is + * also passed to rc_read_init() which will ignore it. 
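 * (Editor's note: the ~ in the lzma_props() call below undoes the
 * bitwise negation of the properties byte that the MicroLZMA header
 * format applies, as described in the XZ_DEC_MICROLZMA Kconfig help
 * earlier in this diff.)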
+ */ + if (!lzma_props(s, ~b->in[b->in_pos])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + } + + /* + * xz_dec_microlzma_reset() doesn't validate the compressed + * size so we do it here. We have to limit the maximum size + * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice + * round number and much more than users of this code should + * ever need. + */ + if (s->lzma2.compressed < RC_INIT_BYTES + || s->lzma2.compressed > (3U << 30)) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + dict_reset(&s->dict, b); + } + + /* This is to allow increasing b->out_size between calls. */ + if (DEC_IS_SINGLE(s->dict.mode)) + s->dict.end = b->out_size - b->out_pos; + + while (true) { + dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, + s->lzma2.uncompressed)); + + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.pedantic_microlzma) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + } + + return XZ_STREAM_END; + } + + if (b->out_pos == b->out_size) + return XZ_OK; + + if (b->in_pos == b->in_size + && s->temp.size < s->lzma2.compressed) + return XZ_OK; + } +} + +struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size) +{ + struct xz_dec_microlzma *s; + + /* Restrict dict_size to the same range as in the LZMA2 code. */ + if (dict_size < 4096 || dict_size > (3U << 30)) + return NULL; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->s.dict.mode = mode; + s->s.dict.size = dict_size; + + if (DEC_IS_MULTI(mode)) { + s->s.dict.end = dict_size; + + s->s.dict.buf = vmalloc(dict_size); + if (s->s.dict.buf == NULL) { + kfree(s); + return NULL; + } + } + + return s; +} + +void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, + uint32_t uncomp_size, int uncomp_size_is_exact) +{ + /* + * comp_size is validated in xz_dec_microlzma_run(). + * uncomp_size can safely be anything. + */ + s->s.lzma2.compressed = comp_size; + s->s.lzma2.uncompressed = uncomp_size; + s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact; + + s->s.lzma2.sequence = SEQ_PROPERTIES; + s->s.temp.size = 0; +} + +void xz_dec_microlzma_end(struct xz_dec_microlzma *s) +{ + if (DEC_IS_MULTI(s->s.dict.mode)) + vfree(s->s.dict.buf); + + kfree(s); +} +#endif diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c index fea86deaaa01..683570b93a8c 100644 --- a/lib/xz/xz_dec_stream.c +++ b/lib/xz/xz_dec_stream.c @@ -402,12 +402,12 @@ static enum xz_ret dec_stream_header(struct xz_dec *s) * we will accept other check types too, but then the check won't * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. 
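 * (Editor's note, taken from the .xz format specification rather than
 * this patch: valid check IDs are 0..15 (XZ_CHECK_MAX); 0 is "none",
 * 1 is CRC32, 4 is CRC64 and 10 is SHA-256, so with XZ_DEC_ANY_CHECK a
 * CRC64 or SHA-256 stream still decodes but its check is not verified.)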
*/ + if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; #ifdef XZ_DEC_ANY_CHECK - if (s->check_type > XZ_CHECK_MAX) - return XZ_OPTIONS_ERROR; - if (s->check_type > XZ_CHECK_CRC32) return XZ_UNSUPPORTED_CHECK; #else diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c index 32eb3c03aede..61098c67a413 100644 --- a/lib/xz/xz_dec_syms.c +++ b/lib/xz/xz_dec_syms.c @@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset); EXPORT_SYMBOL(xz_dec_run); EXPORT_SYMBOL(xz_dec_end); +#ifdef CONFIG_XZ_DEC_MICROLZMA +EXPORT_SYMBOL(xz_dec_microlzma_alloc); +EXPORT_SYMBOL(xz_dec_microlzma_reset); +EXPORT_SYMBOL(xz_dec_microlzma_run); +EXPORT_SYMBOL(xz_dec_microlzma_end); +#endif + MODULE_DESCRIPTION("XZ decompressor"); -MODULE_VERSION("1.0"); +MODULE_VERSION("1.1"); MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org> and Igor Pavlov"); /* diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 09360ebb510e..bf1e94ec7873 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -37,6 +37,9 @@ # ifdef CONFIG_XZ_DEC_SPARC # define XZ_DEC_SPARC # endif +# ifdef CONFIG_XZ_DEC_MICROLZMA +# define XZ_DEC_MICROLZMA +# endif # define memeq(a, b, size) (memcmp(a, b, size) == 0) # define memzero(buf, size) memset(buf, 0, size) # endif diff --git a/mm/Makefile b/mm/Makefile index fc60a40ce954..d6c0042e3aa0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -46,7 +46,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page-writeback.o \ + maccess.o page-writeback.o folio-compat.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 4a9d4e27d0d9..c878d995af06 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -2,8 +2,9 @@ #include <linux/wait.h> #include <linux/rbtree.h> -#include <linux/backing-dev.h> #include <linux/kthread.h> +#include <linux/backing-dev.h> +#include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -977,6 +978,22 @@ void bdi_put(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_put); +struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return I_BDEV(inode)->bd_disk->bdi; +#endif + return sb->s_bdi; +} +EXPORT_SYMBOL(inode_to_bdi); + const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) diff --git a/mm/compaction.c b/mm/compaction.c index bfc93da1c2c7..fbc60f964c38 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1022,7 +1022,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!TestClearPageLRU(page)) goto isolate_fail_put; - lruvec = mem_cgroup_page_lruvec(page); + lruvec = folio_lruvec(page_folio(page)); /* If we already hold the lock, we can skip some rechecking */ if (lruvec != locked) { @@ -1032,7 +1032,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, page_folio(page)); /* Try get exclusive access under lock */ if (!skip_updated) { diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index c938a9c34e6c..7008c3735e99 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ 
-219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test) r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(42); r = damon_new_region(0, 220); damon_add_region(r, t); damon_split_regions_of(c, t, 4); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); } diff --git a/mm/filemap.c b/mm/filemap.c index dae481293b5d..5e206a429b57 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -30,7 +30,6 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> -#include <linux/blkdev.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> @@ -835,6 +834,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); */ void replace_page_cache_page(struct page *old, struct page *new) { + struct folio *fold = page_folio(old); + struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; @@ -848,7 +849,7 @@ void replace_page_cache_page(struct page *old, struct page *new) new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(old, new); + mem_cgroup_migrate(fold, fnew); xas_lock_irq(&xas); xas_store(&xas, new); @@ -870,26 +871,25 @@ void replace_page_cache_page(struct page *old, struct page *new) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -noinline int __add_to_page_cache_locked(struct page *page, - struct address_space *mapping, - pgoff_t offset, gfp_t gfp, - void **shadowp) +noinline int __filemap_add_folio(struct address_space *mapping, + struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { - XA_STATE(xas, &mapping->i_pages, offset); - int huge = PageHuge(page); + XA_STATE(xas, &mapping->i_pages, index); + int huge = folio_test_hugetlb(folio); int error; bool charged = false; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapBacked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); mapping_set_update(&xas, mapping); - get_page(page); - page->mapping = mapping; - page->index = offset; + folio_get(folio); + folio->mapping = mapping; + folio->index = index; if (!huge) { - error = mem_cgroup_charge(page, NULL, gfp); + error = mem_cgroup_charge(folio, NULL, gfp); + VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) goto error; charged = true; @@ -901,7 +901,7 @@ noinline int __add_to_page_cache_locked(struct page *page, unsigned int order = xa_get_order(xas.xa, xas.xa_index); void *entry, *old = NULL; - if (order > thp_order(page)) + if (order > folio_order(folio)) xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), order, gfp); xas_lock_irq(&xas); @@ -918,13 +918,13 @@ noinline int __add_to_page_cache_locked(struct page *page, *shadowp = old; /* entry may have been split before we acquired lock */ order = xa_get_order(xas.xa, xas.xa_index); - if (order > thp_order(page)) { + if (order > folio_order(folio)) { xas_split(&xas, old, order); xas_reset(&xas); } } - xas_store(&xas, page); + xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; @@ -932,7 +932,7 @@ noinline int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_lruvec_page_state(page, NR_FILE_PAGES); 
+ __lruvec_stat_add_folio(folio, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -940,19 +940,19 @@ unlock: if (xas_error(&xas)) { error = xas_error(&xas); if (charged) - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(folio); goto error; } - trace_mm_filemap_add_to_page_cache(page); + trace_mm_filemap_add_to_page_cache(&folio->page); return 0; error: - page->mapping = NULL; + folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - put_page(page); + folio_put(folio); return error; } -ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); +ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); /** * add_to_page_cache_locked - add a locked page to the pagecache @@ -969,59 +969,58 @@ ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - return __add_to_page_cache_locked(page, mapping, offset, + return __filemap_add_folio(mapping, page_folio(page), offset, gfp_mask, NULL); } EXPORT_SYMBOL(add_to_page_cache_locked); -int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +int filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; - __SetPageLocked(page); - ret = __add_to_page_cache_locked(page, mapping, offset, - gfp_mask, &shadow); + __folio_set_locked(folio); + ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) - __ClearPageLocked(page); + __folio_clear_locked(folio); else { /* - * The page might have been evicted from cache only + * The folio might have been evicted from cache only * recently, in which case it should be activated like - * any other repeatedly accessed page. - * The exception is pages getting rewritten; evicting other + * any other repeatedly accessed folio. + * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. 
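 * (Editor's note: the shadow value consulted below is the eviction
 * cookie the page cache kept in place of the old folio; handing it to
 * workingset_refault() is what lets the LRU decide whether this refault
 * deserves immediate activation.)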
*/ - WARN_ON_ONCE(PageActive(page)); - if (!(gfp_mask & __GFP_WRITE) && shadow) - workingset_refault(page, shadow); - lru_cache_add(page); + WARN_ON_ONCE(folio_test_active(folio)); + if (!(gfp & __GFP_WRITE) && shadow) + workingset_refault(folio, shadow); + folio_add_lru(folio); } return ret; } -EXPORT_SYMBOL_GPL(add_to_page_cache_lru); +EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct page *__page_cache_alloc(gfp_t gfp) +struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) { int n; - struct page *page; + struct folio *folio; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - page = __alloc_pages_node(n, gfp, 0); - } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + folio = __folio_alloc_node(gfp, order, n); + } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); - return page; + return folio; } - return alloc_pages(gfp, 0); + return folio_alloc(gfp, order); } -EXPORT_SYMBOL(__page_cache_alloc); +EXPORT_SYMBOL(filemap_alloc_folio); #endif /* @@ -1074,11 +1073,11 @@ EXPORT_SYMBOL(filemap_invalidate_unlock_two); */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) -static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; +static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; -static wait_queue_head_t *page_waitqueue(struct page *page) +static wait_queue_head_t *folio_waitqueue(struct folio *folio) { - return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)]; + return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } void __init pagecache_init(void) @@ -1086,7 +1085,7 @@ void __init pagecache_init(void) int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) - init_waitqueue_head(&page_wait_table[i]); + init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); } @@ -1141,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->page->flags)) + if (test_bit(key->bit_nr, &key->folio->flags)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->page->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags)) return -1; flags |= WQ_FLAG_DONE; } @@ -1157,7 +1156,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs - * with the load-acquire in wait_on_page_bit_common(). + * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); @@ -1176,14 +1175,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return (flags & WQ_FLAG_EXCLUSIVE) != 0; } -static void wake_up_page_bit(struct page *page, int bit_nr) +static void folio_wake_bit(struct folio *folio, int bit_nr) { - wait_queue_head_t *q = page_waitqueue(page); + wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; wait_queue_entry_t bookmark; - key.page = page; + key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; @@ -1218,7 +1217,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr) * page waiters. 
*/ if (!waitqueue_active(q) || !key.page_match) { - ClearPageWaiters(page); + folio_clear_waiters(folio); /* * It's possible to miss clearing Waiters here, when we woke * our page waiters, but the hashed waitqueue has waiters for @@ -1230,19 +1229,19 @@ static void wake_up_page_bit(struct page *page, int bit_nr) spin_unlock_irqrestore(&q->lock, flags); } -static void wake_up_page(struct page *page, int bit) +static void folio_wake(struct folio *folio, int bit) { - if (!PageWaiters(page)) + if (!folio_test_waiters(folio)) return; - wake_up_page_bit(page, bit); + folio_wake_bit(folio, bit); } /* - * A choice of three behaviors for wait_on_page_bit_common(): + * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like - * __lock_page() waiting on then setting PG_locked. + * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * wait_on_page_writeback() waiting on PG_writeback. @@ -1253,16 +1252,16 @@ enum behavior { }; /* - * Attempt to check (or get) the page bit, and mark us done + * Attempt to check (or get) the folio flag, and mark us done * if successful. */ -static inline bool trylock_page_bit_common(struct page *page, int bit_nr, +static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &page->flags)) + if (test_and_set_bit(bit_nr, &folio->flags)) return false; - } else if (test_bit(bit_nr, &page->flags)) + } else if (test_bit(bit_nr, &folio->flags)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; @@ -1272,9 +1271,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, /* How many times do we accept lock stealing from under a waiter? */ int sysctl_page_lock_unfairness = 5; -static inline int wait_on_page_bit_common(wait_queue_head_t *q, - struct page *page, int bit_nr, int state, enum behavior behavior) +static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, + int state, enum behavior behavior) { + wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1283,8 +1283,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, unsigned long pflags; if (bit_nr == PG_locked && - !PageUptodate(page) && PageWorkingset(page)) { - if (!PageSwapBacked(page)) { + !folio_test_uptodate(folio) && folio_test_workingset(folio)) { + if (!folio_test_swapbacked(folio)) { delayacct_thrashing_start(); delayacct = true; } @@ -1294,7 +1294,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, init_wait(wait); wait->func = wake_page_function; - wait_page.page = page; + wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: @@ -1309,7 +1309,7 @@ repeat: * Do one last check whether we can get the * page bit synchronously. * - * Do the SetPageWaiters() marking before that + * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the @@ -1320,8 +1320,8 @@ repeat: * lock to avoid races. 
*/ spin_lock_irq(&q->lock); - SetPageWaiters(page); - if (!trylock_page_bit_common(page, bit_nr, wait)) + folio_set_waiters(folio); + if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); @@ -1331,10 +1331,10 @@ repeat: * see whether the page bit testing has already * been done by the wake function. * - * We can drop our reference to the page. + * We can drop our reference to the folio. */ if (behavior == DROP) - put_page(page); + folio_put(folio); /* * Note that until the "finish_wait()", or until @@ -1371,7 +1371,7 @@ repeat: * * And if that fails, we'll have to retry this all. */ - if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; @@ -1380,7 +1380,7 @@ repeat: /* * If a signal happened, this 'finish_wait()' may remove the last - * waiter from the wait-queues, but the PageWaiters bit will remain + * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ @@ -1411,19 +1411,17 @@ repeat: return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } -void wait_on_page_bit(struct page *page, int bit_nr) +void folio_wait_bit(struct folio *folio, int bit_nr) { - wait_queue_head_t *q = page_waitqueue(page); - wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); + folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } -EXPORT_SYMBOL(wait_on_page_bit); +EXPORT_SYMBOL(folio_wait_bit); -int wait_on_page_bit_killable(struct page *page, int bit_nr) +int folio_wait_bit_killable(struct folio *folio, int bit_nr) { - wait_queue_head_t *q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); + return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } -EXPORT_SYMBOL(wait_on_page_bit_killable); +EXPORT_SYMBOL(folio_wait_bit_killable); /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked @@ -1440,31 +1438,28 @@ EXPORT_SYMBOL(wait_on_page_bit_killable); */ int put_and_wait_on_page_locked(struct page *page, int state) { - wait_queue_head_t *q; - - page = compound_head(page); - q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, PG_locked, state, DROP); + return folio_wait_bit_common(page_folio(page), PG_locked, state, + DROP); } /** - * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue - * @page: Page defining the wait queue of interest + * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue + * @folio: Folio defining the wait queue of interest * @waiter: Waiter to add to the queue * - * Add an arbitrary @waiter to the wait queue for the nominated @page. + * Add an arbitrary @waiter to the wait queue for the nominated @folio. 
*/ -void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) +void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) { - wait_queue_head_t *q = page_waitqueue(page); + wait_queue_head_t *q = folio_waitqueue(folio); unsigned long flags; spin_lock_irqsave(&q->lock, flags); __add_wait_queue_entry_tail(q, waiter); - SetPageWaiters(page); + folio_set_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL_GPL(add_page_wait_queue); +EXPORT_SYMBOL_GPL(folio_add_wait_queue); #ifndef clear_bit_unlock_is_negative_byte @@ -1490,124 +1485,116 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem #endif /** - * unlock_page - unlock a locked page - * @page: the page + * folio_unlock - Unlock a locked folio. + * @folio: The folio. * - * Unlocks the page and wakes up sleepers in wait_on_page_locked(). - * Also wakes sleepers in wait_on_page_writeback() because the wakeup - * mechanism between PageLocked pages and PageWriteback pages is shared. - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. + * Unlocks the folio and wakes up any thread sleeping on the page lock. * - * Note that this depends on PG_waiters being the sign bit in the byte - * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to - * clear the PG_locked bit and test PG_waiters at the same time fairly - * portably (architectures that do LL/SC can test any bit, while x86 can - * test the sign bit). + * Context: May be called from interrupt or process context. May not be + * called from NMI context. */ -void unlock_page(struct page *page) +void folio_unlock(struct folio *folio) { + /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); - page = compound_head(page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags)) - wake_up_page_bit(page, PG_locked); + BUILD_BUG_ON(PG_locked > 7); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0))) + folio_wake_bit(folio, PG_locked); } -EXPORT_SYMBOL(unlock_page); +EXPORT_SYMBOL(folio_unlock); /** - * end_page_private_2 - Clear PG_private_2 and release any waiters - * @page: The page + * folio_end_private_2 - Clear PG_private_2 and wake any waiters. + * @folio: The folio. * - * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for - * this. The page ref held for PG_private_2 being set is released. + * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for + * it. The folio reference held for PG_private_2 being set is released. * - * This is, for example, used when a netfs page is being written to a local - * disk cache, thereby allowing writes to the cache for the same page to be + * This is, for example, used when a netfs folio is being written to a local + * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. 
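 *
 * (Editor's sketch of the expected shape of use, not a specific API:
 * the writer takes a folio reference and sets PG_private_2 before
 * submitting the cache write, calls folio_end_private_2() on completion,
 * and anyone else wanting to write the same folio to the cache calls
 * folio_wait_private_2() first.)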
*/ -void end_page_private_2(struct page *page) +void folio_end_private_2(struct folio *folio) { - page = compound_head(page); - VM_BUG_ON_PAGE(!PagePrivate2(page), page); - clear_bit_unlock(PG_private_2, &page->flags); - wake_up_page_bit(page, PG_private_2); - put_page(page); + VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); + clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); + folio_wake_bit(folio, PG_private_2); + folio_put(folio); } -EXPORT_SYMBOL(end_page_private_2); +EXPORT_SYMBOL(folio_end_private_2); /** - * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page - * @page: The page to wait on + * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. + * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page. + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. */ -void wait_on_page_private_2(struct page *page) +void folio_wait_private_2(struct folio *folio) { - page = compound_head(page); - while (PagePrivate2(page)) - wait_on_page_bit(page, PG_private_2); + while (folio_test_private_2(folio)) + folio_wait_bit(folio, PG_private_2); } -EXPORT_SYMBOL(wait_on_page_private_2); +EXPORT_SYMBOL(folio_wait_private_2); /** - * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page - * @page: The page to wait on + * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. + * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a * fatal signal is received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ -int wait_on_page_private_2_killable(struct page *page) +int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; - page = compound_head(page); - while (PagePrivate2(page)) { - ret = wait_on_page_bit_killable(page, PG_private_2); + while (folio_test_private_2(folio)) { + ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } -EXPORT_SYMBOL(wait_on_page_private_2_killable); +EXPORT_SYMBOL(folio_wait_private_2_killable); /** - * end_page_writeback - end writeback against a page - * @page: the page + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. */ -void end_page_writeback(struct page *page) +void folio_end_writeback(struct folio *folio) { /* - * TestClearPageReclaim could be used here but it is an atomic - * operation and overkill in this particular case. Failing to - * shuffle a page marked for immediate reclaim is too mild to - * justify taking an atomic operation penalty at the end of - * ever page writeback. + * folio_test_clear_reclaim() could be used here but it is an + * atomic operation and overkill in this particular case. Failing + * to shuffle a folio marked for immediate reclaim is too mild + * a gain to justify taking an atomic operation penalty at the + * end of every folio writeback. */ - if (PageReclaim(page)) { - ClearPageReclaim(page); - rotate_reclaimable_page(page); + if (folio_test_reclaim(folio)) { + folio_clear_reclaim(folio); + folio_rotate_reclaimable(folio); } /* - * Writeback does not hold a page reference of its own, relying + * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. - * But here we must make sure that the page is not freed and - * reused before the wake_up_page(). 
+ * But here we must make sure that the folio is not freed and + * reused before the folio_wake(). */ - get_page(page); - if (!test_clear_page_writeback(page)) + folio_get(folio); + if (!__folio_end_writeback(folio)) BUG(); smp_mb__after_atomic(); - wake_up_page(page, PG_writeback); - put_page(page); + folio_wake(folio, PG_writeback); + folio_put(folio); } -EXPORT_SYMBOL(end_page_writeback); +EXPORT_SYMBOL(folio_end_writeback); /* * After completing I/O on a page, call this routine to update the page @@ -1638,39 +1625,35 @@ void page_endio(struct page *page, bool is_write, int err) EXPORT_SYMBOL_GPL(page_endio); /** - * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @__page: the page to lock + * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. + * @folio: The folio to lock */ -void __lock_page(struct page *__page) +void __folio_lock(struct folio *folio) { - struct page *page = compound_head(__page); - wait_queue_head_t *q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, + folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } -EXPORT_SYMBOL(__lock_page); +EXPORT_SYMBOL(__folio_lock); -int __lock_page_killable(struct page *__page) +int __folio_lock_killable(struct folio *folio) { - struct page *page = compound_head(__page); - wait_queue_head_t *q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, + return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } -EXPORT_SYMBOL_GPL(__lock_page_killable); +EXPORT_SYMBOL_GPL(__folio_lock_killable); -int __lock_page_async(struct page *page, struct wait_page_queue *wait) +static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { - struct wait_queue_head *q = page_waitqueue(page); + struct wait_queue_head *q = folio_waitqueue(folio); int ret = 0; - wait->page = page; + wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); - SetPageWaiters(page); - ret = !trylock_page(page); + folio_set_waiters(folio); + ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's @@ -1687,16 +1670,16 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait) /* * Return values: - * 1 - page is locked; mmap_lock is still held. - * 0 - page is not locked. + * true - folio is locked; mmap_lock is still held. + * false - folio is not locked. * mmap_lock has been released (mmap_read_unlock(), unless flags had both * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in * which case mmap_lock is still held. * - * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 - * with the page locked and the mmap_lock unperturbed. + * If neither ALLOW_RETRY nor KILLABLE are set, will always return true + * with the folio locked and the mmap_lock unperturbed. */ -int __lock_page_or_retry(struct page *page, struct mm_struct *mm, +bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, unsigned int flags) { if (fault_flag_allow_retry_first(flags)) { @@ -1705,28 +1688,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, * even though return 0. 
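 * (Editor's clarification: this is the FAULT_FLAG_RETRY_NOWAIT case
 * handled just below; the caller keeps mmap_lock and must not assume
 * it was dropped, unlike the ordinary retry path that unlocks and
 * waits for the folio.)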
*/ if (flags & FAULT_FLAG_RETRY_NOWAIT) - return 0; + return false; mmap_read_unlock(mm); if (flags & FAULT_FLAG_KILLABLE) - wait_on_page_locked_killable(page); + folio_wait_locked_killable(folio); else - wait_on_page_locked(page); - return 0; + folio_wait_locked(folio); + return false; } if (flags & FAULT_FLAG_KILLABLE) { - int ret; + bool ret; - ret = __lock_page_killable(page); + ret = __folio_lock_killable(folio); if (ret) { mmap_read_unlock(mm); - return 0; + return false; } } else { - __lock_page(page); + __folio_lock(folio); } - return 1; + return true; } /** @@ -1802,143 +1785,155 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, EXPORT_SYMBOL(page_cache_prev_miss); /* + * Lockless page cache protocol: + * On the lookup side: + * 1. Load the folio from i_pages + * 2. Increment the refcount if it's not zero + * 3. If the folio is not found by xas_reload(), put the refcount and retry + * + * On the removal side: + * A. Freeze the page (by zeroing the refcount if nobody else has a reference) + * B. Remove the page from i_pages + * C. Return the page to the page allocator + * + * This means that any page may have its reference count temporarily + * increased by a speculative page cache (or fast GUP) lookup as it can + * be allocated by another user before the RCU grace period expires. + * Because the refcount temporarily acquired here may end up being the + * last refcount on the page, any page allocation must be freeable by + * folio_put(). + */ + +/* * mapping_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @index. If there is a - * page cache page, the head page is returned with an increased refcount. + * Looks up the page cache entry at @mapping & @index. If it is a folio, + * it is returned with an increased refcount. If it is a shadow entry + * of a previously evicted folio, or a swap entry from shmem/tmpfs, + * it is returned without further action. * - * If the slot holds a shadow entry of a previously evicted page, or a - * swap entry from shmem/tmpfs, it is returned. - * - * Return: The head page or shadow entry, %NULL if nothing is found. + * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ -static struct page *mapping_get_entry(struct address_space *mapping, - pgoff_t index) +static void *mapping_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); - struct page *page; + struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); - page = xas_load(&xas); - if (xas_retry(&xas, page)) + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ - if (!page || xa_is_value(page)) + if (!folio || xa_is_value(folio)) goto out; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto repeat; - /* - * Has the page moved or been split? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != xas_reload(&xas))) { - put_page(page); + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); goto repeat; } out: rcu_read_unlock(); - return page; + return folio; } /** - * pagecache_get_page - Find and get a reference to a page. + * __filemap_get_folio - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. 
- * @fgp_flags: %FGP flags modify how the page is returned. - * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified. + * @fgp_flags: %FGP flags modify how the folio is returned. + * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * * Looks up the page cache entry at @mapping & @index. * * @fgp_flags can be zero or more of these flags: * - * * %FGP_ACCESSED - The page will be marked accessed. - * * %FGP_LOCK - The page is returned locked. - * * %FGP_HEAD - If the page is present and a THP, return the head page - * rather than the exact page specified by the index. + * * %FGP_ACCESSED - The folio will be marked accessed. + * * %FGP_LOCK - The folio is returned locked. * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it - * instead of allocating a new page to replace it. + * instead of allocating a new folio to replace it. * * %FGP_CREAT - If no page is present then a new page is allocated using - * @gfp_mask and added to the page cache and the VM's LRU list. + * @gfp and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the * page is already in cache. If the page was allocated, unlock it before * returning so the caller can do the same dance. - * * %FGP_WRITE - The page will be written - * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask - * * %FGP_NOWAIT - Don't get blocked by page lock + * * %FGP_WRITE - The page will be written to by the caller. + * * %FGP_NOFS - __GFP_FS will get cleared in gfp. + * * %FGP_NOWAIT - Don't get blocked by page lock. + * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. * - * Return: The found page or %NULL otherwise. + * Return: The found folio or %NULL otherwise. */ -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp_mask) +struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp) { - struct page *page; + struct folio *folio; repeat: - page = mapping_get_entry(mapping, index); - if (xa_is_value(page)) { + folio = mapping_get_entry(mapping, index); + if (xa_is_value(folio)) { if (fgp_flags & FGP_ENTRY) - return page; - page = NULL; + return folio; + folio = NULL; } - if (!page) + if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return NULL; } } else { - lock_page(page); + folio_lock(folio); } /* Has the page been truncated? 
*/ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto repeat; } - VM_BUG_ON_PAGE(!thp_contains(page, index), page); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) - mark_page_accessed(page); + folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ - if (page_is_idle(page)) - clear_page_idle(page); + if (folio_test_idle(folio)) + folio_clear_idle(folio); } - if (!(fgp_flags & FGP_HEAD)) - page = find_subpage(page, index); + if (fgp_flags & FGP_STABLE) + folio_wait_stable(folio); no_page: - if (!page && (fgp_flags & FGP_CREAT)) { + if (!folio && (fgp_flags & FGP_CREAT)) { int err; if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) - gfp_mask |= __GFP_WRITE; + gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) - gfp_mask &= ~__GFP_FS; + gfp &= ~__GFP_FS; - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp, 0); + if (!folio) return NULL; if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) @@ -1946,27 +1941,27 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - __SetPageReferenced(page); + __folio_set_referenced(folio); - err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { - put_page(page); - page = NULL; + folio_put(folio); + folio = NULL; if (err == -EEXIST) goto repeat; } /* - * add_to_page_cache_lru locks the page, and for mmap we expect - * an unlocked page. + * filemap_add_folio locks the page, and for mmap + * we expect an unlocked page. */ - if (page && (fgp_flags & FGP_FOR_MMAP)) - unlock_page(page); + if (folio && (fgp_flags & FGP_FOR_MMAP)) + folio_unlock(folio); } - return page; + return folio; } -EXPORT_SYMBOL(pagecache_get_page); +EXPORT_SYMBOL(__filemap_get_folio); static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) @@ -2421,6 +2416,7 @@ static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, struct page *page) { + struct folio *folio = page_folio(page); int error; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2430,40 +2426,40 @@ static int filemap_update_page(struct kiocb *iocb, filemap_invalidate_lock_shared(mapping); } - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); - put_and_wait_on_page_locked(page, TASK_KILLABLE); + put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } - error = __lock_page_async(page, iocb->ki_waitq); + error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; - if (!page->mapping) + if (!folio->mapping) goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_page(iocb->ki_filp, mapping, page); + error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); goto unlock_mapping; unlock: - unlock_page(page); + folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); 
if (error == AOP_TRUNCATED_PAGE) - put_page(page); + folio_put(folio); return error; } @@ -2900,7 +2896,9 @@ unlock: static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, struct file **fpin) { - if (trylock_page(page)) + struct folio *folio = page_folio(page); + + if (folio_trylock(folio)) return 1; /* @@ -2913,7 +2911,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { - if (__lock_page_killable(page)) { + if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the mmap_lock, * but all fault_handlers only check for fatal signals @@ -2925,11 +2923,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, return 0; } } else - __lock_page(page); + __folio_lock(folio); + return 1; } - /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have @@ -3708,28 +3706,6 @@ out: } EXPORT_SYMBOL(generic_file_direct_write); -/* - * Find or create a page at the given pagecache position. Return the locked - * page. This function is specifically for buffered writes. - */ -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index, unsigned flags) -{ - struct page *page; - int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; - - if (flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - - page = pagecache_get_page(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (page) - wait_for_stable_page(page); - - return page; -} -EXPORT_SYMBOL(grab_cache_page_write_begin); - ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { diff --git a/mm/folio-compat.c b/mm/folio-compat.c new file mode 100644 index 000000000000..5b6ae1da314e --- /dev/null +++ b/mm/folio-compat.c @@ -0,0 +1,142 @@ +/* + * Compatibility functions which bloat the callers too much to make inline. + * All of the callers of these functions should be converted to use folios + * eventually. 
+ */ + +#include <linux/migrate.h> +#include <linux/pagemap.h> +#include <linux/swap.h> + +struct address_space *page_mapping(struct page *page) +{ + return folio_mapping(page_folio(page)); +} +EXPORT_SYMBOL(page_mapping); + +void unlock_page(struct page *page) +{ + return folio_unlock(page_folio(page)); +} +EXPORT_SYMBOL(unlock_page); + +void end_page_writeback(struct page *page) +{ + return folio_end_writeback(page_folio(page)); +} +EXPORT_SYMBOL(end_page_writeback); + +void wait_on_page_writeback(struct page *page) +{ + return folio_wait_writeback(page_folio(page)); +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback); + +void wait_for_stable_page(struct page *page) +{ + return folio_wait_stable(page_folio(page)); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); + +bool page_mapped(struct page *page) +{ + return folio_mapped(page_folio(page)); +} +EXPORT_SYMBOL(page_mapped); + +void mark_page_accessed(struct page *page) +{ + folio_mark_accessed(page_folio(page)); +} +EXPORT_SYMBOL(mark_page_accessed); + +#ifdef CONFIG_MIGRATION +int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page, int extra_count) +{ + return folio_migrate_mapping(mapping, page_folio(newpage), + page_folio(page), extra_count); +} +EXPORT_SYMBOL(migrate_page_move_mapping); + +void migrate_page_states(struct page *newpage, struct page *page) +{ + folio_migrate_flags(page_folio(newpage), page_folio(page)); +} +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + folio_migrate_copy(page_folio(newpage), page_folio(page)); +} +EXPORT_SYMBOL(migrate_page_copy); +#endif + +bool set_page_writeback(struct page *page) +{ + return folio_start_writeback(page_folio(page)); +} +EXPORT_SYMBOL(set_page_writeback); + +bool set_page_dirty(struct page *page) +{ + return folio_mark_dirty(page_folio(page)); +} +EXPORT_SYMBOL(set_page_dirty); + +int __set_page_dirty_nobuffers(struct page *page) +{ + return filemap_dirty_folio(page_mapping(page), page_folio(page)); +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +bool clear_page_dirty_for_io(struct page *page) +{ + return folio_clear_dirty_for_io(page_folio(page)); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +bool redirty_page_for_writepage(struct writeback_control *wbc, + struct page *page) +{ + return folio_redirty_for_writepage(wbc, page_folio(page)); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +void lru_cache_add(struct page *page) +{ + folio_add_lru(page_folio(page)); +} +EXPORT_SYMBOL(lru_cache_add); + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + return filemap_add_folio(mapping, page_folio(page), index, gfp); +} +EXPORT_SYMBOL(add_to_page_cache_lru); + +noinline +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp) +{ + struct folio *folio; + + folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); + if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + return &folio->page; + return folio_file_page(folio, index); +} +EXPORT_SYMBOL(pagecache_get_page); + +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) +{ + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + + if (flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + return pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(grab_cache_page_write_begin); diff --git a/mm/highmem.c b/mm/highmem.c 
index 4212ad0e4a19..471d9779a7f4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -23,7 +23,6 @@ #include <linux/bio.h> #include <linux/pagemap.h> #include <linux/mempool.h> -#include <linux/blkdev.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5e9ef0fc261e..e5483347291c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); @@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail, static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); + struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; @@ -2424,7 +2425,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = lock_page_lruvec(head); + lruvec = folio_lruvec_lock(folio); + + ClearPageHasHWPoisoned(head); for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); @@ -2700,12 +2703,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { int nr = thp_nr_pages(head); - if (PageSwapBacked(head)) + if (PageSwapBacked(head)) { __mod_lruvec_page_state(head, NR_SHMEM_THPS, -nr); - else + } else { __mod_lruvec_page_state(head, NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } } __split_huge_page(page, list, end); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 95dc7b83381f..6378c1066459 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5302,7 +5302,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; goto out; } - copy_huge_page(page, *pagep); + folio_copy(page_folio(page), page_folio(*pagep)); put_page(*pagep); *pagep = NULL; } diff --git a/mm/internal.h b/mm/internal.h index cf3cb933eba3..b1001ebeb286 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -34,7 +34,16 @@ void page_writeback_init(void); +static inline void *folio_raw_mapping(struct folio *folio) +{ + unsigned long mapping = (unsigned long)folio->mapping; + + return (void *)(mapping & ~PAGE_MAPPING_FLAGS); +} + vm_fault_t do_swap_page(struct vm_fault *vmf); +void folio_rotate_reclaimable(struct folio *folio); +bool __folio_end_writeback(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -63,17 +72,28 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, pgoff_t end, struct pagevec *pvec, pgoff_t *indices); /** - * page_evictable - test whether a page is evictable - * @page: the page to test + * folio_evictable - Test whether a folio is evictable. + * @folio: The folio to test. * - * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. - * - * Reasons page might not be evictable: - * (1) page's mapping marked unevictable - * (2) page is part of an mlocked VMA + * Test whether @folio is evictable -- i.e., should be placed on + * active/inactive lists vs unevictable list. 
* + * Reasons folio might not be evictable: + * 1. folio's mapping marked unevictable + * 2. One of the pages in the folio is part of an mlocked VMA */ +static inline bool folio_evictable(struct folio *folio) +{ + bool ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(folio_mapping(folio)) && + !folio_test_mlocked(folio); + rcu_read_unlock(); + return ret; +} + static inline bool page_evictable(struct page *page) { bool ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 045cc579f724..5f02fda6f265 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -445,22 +445,26 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, if (!transhuge_vma_enabled(vma, vm_flags)) return false; + if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - + vma->vm_pgoff, HPAGE_PMD_NR)) + return false; + /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); - } + if (shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); /* THP settings require madvise. */ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) return false; - /* Read-only file mappings need to be aligned for THP to work. */ + /* Only regular file is valid */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && - !inode_is_open_for_write(vma->vm_file->f_inode) && (vm_flags & VM_EXEC)) { - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); + (vm_flags & VM_EXEC)) { + struct inode *inode = vma->vm_file->f_inode; + + return !inode_is_open_for_write(inode) && + S_ISREG(inode->i_mode); } if (!vma->anon_vma || vma->vm_ops) @@ -1087,7 +1090,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { + if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1211,7 +1214,7 @@ out_up_write: mmap_write_unlock(mm); out_nolock: if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(*hpage); + mem_cgroup_uncharge(page_folio(*hpage)); trace_mm_collapse_huge_page(mm, isolated, result); return; } @@ -1658,7 +1661,7 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { + if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1763,6 +1766,10 @@ static void collapse_file(struct mm_struct *mm, filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; + } else if (PageWriteback(page)) { + xas_unlock_irq(&xas); + result = SCAN_FAIL; + goto xa_unlocked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); @@ -1798,7 +1805,8 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } - if (!is_shmem && PageDirty(page)) { + if (!is_shmem && (PageDirty(page) || + PageWriteback(page))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed @@ -1975,7 +1983,7 @@ xa_unlocked: out: VM_BUG_ON(!list_empty(&pagelist)); if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(*hpage); + mem_cgroup_uncharge(page_folio(*hpage)); /* TODO: tracepoints */ } diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c @@ -751,7 +751,7 @@ stale: /* * We come here from above when page->mapping or !PageSwapCache * suggests that the node is stale; but it might be under migration. 
- * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), + * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); @@ -852,9 +852,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *folio_stable_node(struct folio *folio) +{ + return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; +} + static inline struct stable_node *page_stable_node(struct page *page) { - return PageKsm(page) ? page_rmapping(page) : NULL; + return folio_stable_node(page_folio(page)); } static inline void set_page_stable_node(struct page *page, @@ -2578,7 +2583,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { + if (new_page && + mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { put_page(new_page); new_page = NULL; } @@ -2658,26 +2664,26 @@ again: } #ifdef CONFIG_MIGRATION -void ksm_migrate_page(struct page *newpage, struct page *oldpage) +void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct stable_node *stable_node; - VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); + VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); - stable_node = page_stable_node(newpage); + stable_node = folio_stable_node(folio); if (stable_node) { - VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage); - stable_node->kpfn = page_to_pfn(newpage); + VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); + stable_node->kpfn = folio_pfn(newfolio); /* - * newpage->mapping was set in advance; now we need smp_wmb() + * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible - * to get_ksm_page() before it can see that oldpage->mapping - * has gone stale (or that PageSwapCache has been cleared). + * to get_ksm_page() before it can see that folio->mapping + * has gone stale (or that folio_test_swapcache has been cleared). */ smp_wmb(); - set_page_stable_node(oldpage, NULL); + set_page_stable_node(&folio->page, NULL); } } #endif /* CONFIG_MIGRATION */ diff --git a/mm/memblock.c b/mm/memblock.c index 184dcd2e5d99..5096500b2647 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -932,6 +932,9 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) * covered by the memory map. The struct page representing NOMAP memory * frames in the memory map will be PageReserved() * + * Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from + * memblock, the caller must inform kmemleak to ignore that memory + * * Return: 0 on success, -errno on failure. 
*/ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) @@ -1687,7 +1690,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) if (!size) return; - if (memblock.memory.cnt <= 1) { + if (!memblock_memory->total_size) { pr_warn("%s: No memory registered yet\n", __func__); return; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6da5020a8656..8dab23a71fc4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -456,28 +456,6 @@ ino_t page_cgroup_ino(struct page *page) return ino; } -static struct mem_cgroup_per_node * -mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) -{ - int nid = page_to_nid(page); - - return memcg->nodeinfo[nid]; -} - -static struct mem_cgroup_tree_per_node * -soft_limit_tree_node(int nid) -{ - return soft_limit_tree.rb_tree_per_node[nid]; -} - -static struct mem_cgroup_tree_per_node * -soft_limit_tree_from_page(struct page *page) -{ - int nid = page_to_nid(page); - - return soft_limit_tree.rb_tree_per_node[nid]; -} - static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) @@ -548,13 +526,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) return excess; } -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) { unsigned long excess; struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; /* @@ -562,7 +540,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_page_nodeinfo(memcg, page); + mz = memcg->nodeinfo[nid]; excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or @@ -593,7 +571,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = memcg->nodeinfo[nid]; - mctz = soft_limit_tree_node(nid); + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (mctz) mem_cgroup_remove_exceeded(mz, mctz); } @@ -799,7 +777,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, - struct page *page, int nr_pages) { /* pagein of a big page is an event. So, ignore page size */ @@ -842,7 +819,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, * Check events in order. 
* */ -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, int nid) { /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, @@ -853,7 +830,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) MEM_CGROUP_TARGET_SOFTLIMIT); mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, page); + mem_cgroup_update_tree(memcg, nid); } } @@ -1149,64 +1126,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } #ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) - VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); else - VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); } #endif /** - * lock_page_lruvec - lock and return lruvec for a given page. - * @page: the page + * folio_lruvec_lock - Lock the lruvec for a folio. + * @folio: Pointer to the folio. * * These functions are safe to use under any of the following conditions: - * - page locked - * - PageLRU cleared - * - lock_page_memcg() - * - page->_refcount is zero + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held. */ -struct lruvec *lock_page_lruvec(struct page *page) +struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock(&lruvec->lru_lock); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } -struct lruvec *lock_page_lruvec_irq(struct page *page) +/** + * folio_lruvec_lock_irq - Lock the lruvec for a folio. + * @folio: Pointer to the folio. + * + * These functions are safe to use under any of the following conditions: + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held and interrupts + * disabled. + */ +struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock_irq(&lruvec->lru_lock); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } -struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +/** + * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. + * @folio: Pointer to the folio. + * @flags: Pointer to irqsave flags. + * + * These functions are safe to use under any of the following conditions: + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held and interrupts + * disabled. 
+ */ +struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, + unsigned long *flags) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock_irqsave(&lruvec->lru_lock, *flags); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } @@ -1956,18 +1957,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page and memcg binding - * @page: the page + * folio_memcg_lock - Bind a folio to its memcg. + * @folio: The folio. * - * This function protects unlocked LRU pages from being moved to + * This function prevents unlocked LRU folios from being moved to * another cgroup. * - * It ensures lifetime of the locked memcg. Caller is responsible - * for the lifetime of the page. + * It ensures lifetime of the bound memcg. The caller is responsible + * for the lifetime of the folio. */ -void lock_page_memcg(struct page *page) +void folio_memcg_lock(struct folio *folio) { - struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -1981,7 +1981,7 @@ void lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return; again: - memcg = page_memcg(head); + memcg = folio_memcg(folio); if (unlikely(!memcg)) return; @@ -1995,7 +1995,7 @@ again: return; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page_memcg(head)) { + if (memcg != folio_memcg(folio)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2009,9 +2009,15 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; } +EXPORT_SYMBOL(folio_memcg_lock); + +void lock_page_memcg(struct page *page) +{ + folio_memcg_lock(page_folio(page)); +} EXPORT_SYMBOL(lock_page_memcg); -static void __unlock_page_memcg(struct mem_cgroup *memcg) +static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -2026,14 +2032,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page and memcg binding - * @page: the page + * folio_memcg_unlock - Release the binding between a folio and its memcg. + * @folio: The folio. + * + * This releases the binding created by folio_memcg_lock(). This does + * not change the accounting of this folio to its memcg, but it does + * permit others to change it. 
*/ -void unlock_page_memcg(struct page *page) +void folio_memcg_unlock(struct folio *folio) { - struct page *head = compound_head(page); + __folio_memcg_unlock(folio_memcg(folio)); +} +EXPORT_SYMBOL(folio_memcg_unlock); - __unlock_page_memcg(page_memcg(head)); +void unlock_page_memcg(struct page *page) +{ + folio_memcg_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2734,9 +2748,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) } #endif -static void commit_charge(struct page *page, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page_memcg(page), page); + VM_BUG_ON_FOLIO(folio_memcg(folio), folio); /* * Any of the following ensures page's memcg stability: * @@ -2745,7 +2759,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)memcg; } static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) @@ -3015,15 +3029,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { + struct folio *folio = page_folio(page); struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!PageMemcgKmem(page)) + if (!folio_memcg_kmem(folio)) return; - objcg = __page_objcg(page); + objcg = __folio_objcg(folio); obj_cgroup_uncharge_pages(objcg, nr_pages); - page->memcg_data = 0; + folio->memcg_data = 0; obj_cgroup_put(objcg); } @@ -3257,17 +3272,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void split_page_memcg(struct page *head, unsigned int nr) { - struct mem_cgroup *memcg = page_memcg(head); + struct folio *folio = page_folio(head); + struct mem_cgroup *memcg = folio_memcg(folio); int i; if (mem_cgroup_disabled() || !memcg) return; for (i = 1; i < nr; i++) - head[i].memcg_data = head->memcg_data; + folio_page(folio, i)->memcg_data = folio->memcg_data; - if (PageMemcgKmem(head)) - obj_cgroup_get_many(__page_objcg(head), nr - 1); + if (folio_memcg_kmem(folio)) + obj_cgroup_get_many(__folio_objcg(folio), nr - 1); else css_get_many(&memcg->css, nr - 1); } @@ -3381,7 +3397,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, if (order > 0) return 0; - mctz = soft_limit_tree_node(pgdat->node_id); + mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; /* * Do not even bother to check the largest node if the root @@ -4537,17 +4553,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * As being wrong occasionally doesn't matter, updates and accesses to the * records are lockless and racy. */ -void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; int oldest = -1; int i; - trace_track_foreign_dirty(page, wb); + trace_track_foreign_dirty(folio, wb); /* * Pick the slot to use. If there is already a slot for @wb, keep @@ -5575,38 +5591,39 @@ static int mem_cgroup_move_account(struct page *page, struct mem_cgroup *from, struct mem_cgroup *to) { + struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned int nr_pages = compound ? 
thp_nr_pages(page) : 1; - int ret; + unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; + int nid, ret; VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON(compound && !PageTransHuge(page)); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON(compound && !folio_test_large(folio)); /* * Prevent mem_cgroup_migrate() from looking at * page's memory cgroup of its source page while we change it. */ ret = -EBUSY; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto out; ret = -EINVAL; - if (page_memcg(page) != from) + if (folio_memcg(folio) != from) goto out_unlock; - pgdat = page_pgdat(page); + pgdat = folio_pgdat(folio); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - lock_page_memcg(page); + folio_memcg_lock(folio); - if (PageAnon(page)) { - if (page_mapped(page)) { + if (folio_test_anon(folio)) { + if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); - if (PageTransHuge(page)) { + if (folio_test_transhuge(folio)) { __mod_lruvec_state(from_vec, NR_ANON_THPS, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_THPS, @@ -5617,18 +5634,18 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); - if (PageSwapBacked(page)) { + if (folio_test_swapbacked(folio)) { __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); } - if (page_mapped(page)) { + if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); } - if (PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + if (folio_test_dirty(folio)) { + struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { __mod_lruvec_state(from_vec, NR_FILE_DIRTY, @@ -5639,7 +5656,7 @@ static int mem_cgroup_move_account(struct page *page, } } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); } @@ -5662,20 +5679,21 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->memcg_data = (unsigned long)to; + folio->memcg_data = (unsigned long)to; - __unlock_page_memcg(from); + __folio_memcg_unlock(from); ret = 0; + nid = folio_nid(folio); local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); + mem_cgroup_charge_statistics(to, nr_pages); + memcg_check_events(to, nid); + mem_cgroup_charge_statistics(from, -nr_pages); + memcg_check_events(from, nid); local_irq_enable(); out_unlock: - unlock_page(page); + folio_unlock(folio); out: return ret; } @@ -6680,9 +6698,10 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) +static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, + gfp_t gfp) { - unsigned int nr_pages = thp_nr_pages(page); + long nr_pages = folio_nr_pages(folio); int ret; ret = try_charge(memcg, gfp, nr_pages); @@ -6690,38 +6709,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) goto out; css_get(&memcg->css); - 
commit_charge(page, memcg); + commit_charge(folio, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(folio)); local_irq_enable(); out: return ret; } -/** - * __mem_cgroup_charge - charge a newly allocated page to a cgroup - * @page: page to charge - * @mm: mm context of the victim - * @gfp_mask: reclaim mode - * - * Try to charge @page to the memcg that @mm belongs to, reclaiming - * pages according to @gfp_mask if necessary. if @mm is NULL, try to - * charge to the active memcg. - * - * Do not use this for pages allocated for swapin. - * - * Returns 0 on success. Otherwise, an error code is returned. - */ -int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = charge_memcg(page, memcg, gfp_mask); + ret = charge_memcg(folio, memcg, gfp); css_put(&memcg->css); return ret; @@ -6742,6 +6746,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { + struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; @@ -6756,7 +6761,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = charge_memcg(page, memcg, gfp); + ret = charge_memcg(folio, memcg, gfp); css_put(&memcg->css); return ret; @@ -6800,7 +6805,7 @@ struct uncharge_gather { unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; - struct page *dummy_page; + int nid; }; static inline void uncharge_gather_clear(struct uncharge_gather *ug) @@ -6824,36 +6829,36 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg_check_events(ug->memcg, ug->dummy_page); + memcg_check_events(ug->memcg, ug->nid); local_irq_restore(flags); - /* drop reference from uncharge_page */ + /* drop reference from uncharge_folio */ css_put(&ug->memcg->css); } -static void uncharge_page(struct page *page, struct uncharge_gather *ug) +static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { - unsigned long nr_pages; + long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = PageMemcgKmem(page); + bool use_objcg = folio_memcg_kmem(folio); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * page memcg or objcg at this point, we have fully - * exclusive access to the page. + * folio memcg or objcg at this point, we have fully + * exclusive access to the folio. */ if (use_objcg) { - objcg = __page_objcg(page); + objcg = __folio_objcg(folio); /* * This get matches the put at the end of the function and * kmem pages do not hold memcg references anymore. 
*/ memcg = get_mem_cgroup_from_objcg(objcg); } else { - memcg = __page_memcg(page); + memcg = __folio_memcg(folio); } if (!memcg) @@ -6865,19 +6870,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) uncharge_gather_clear(ug); } ug->memcg = memcg; - ug->dummy_page = page; + ug->nid = folio_nid(folio); /* pairs with css_put in uncharge_batch */ css_get(&memcg->css); } - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (use_objcg) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - page->memcg_data = 0; + folio->memcg_data = 0; obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ @@ -6885,28 +6890,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; - page->memcg_data = 0; + folio->memcg_data = 0; } css_put(&memcg->css); } -/** - * __mem_cgroup_uncharge - uncharge a page - * @page: page to uncharge - * - * Uncharge a page previously charged with __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct folio *folio) { struct uncharge_gather ug; - /* Don't touch page->lru of any random page, pre-check: */ - if (!page_memcg(page)) + /* Don't touch folio->lru of any random page, pre-check: */ + if (!folio_memcg(folio)) return; uncharge_gather_clear(&ug); - uncharge_page(page, &ug); + uncharge_folio(folio, &ug); uncharge_batch(&ug); } @@ -6920,52 +6919,49 @@ void __mem_cgroup_uncharge(struct page *page) void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; - struct page *page; + struct folio *folio; uncharge_gather_clear(&ug); - list_for_each_entry(page, page_list, lru) - uncharge_page(page, &ug); + list_for_each_entry(folio, page_list, lru) + uncharge_folio(folio, &ug); if (ug.memcg) uncharge_batch(&ug); } /** - * mem_cgroup_migrate - charge a page's replacement - * @oldpage: currently circulating page - * @newpage: replacement page + * mem_cgroup_migrate - Charge a folio's replacement. + * @old: Currently circulating folio. + * @new: Replacement folio. * - * Charge @newpage as a replacement page for @oldpage. @oldpage will + * Charge @new as a replacement folio for @old. @old will * be uncharged upon free. * - * Both pages must be locked, @newpage->mapping must be set up. + * Both folios must be locked, @new->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; - unsigned int nr_pages; + long nr_pages = folio_nr_pages(new); unsigned long flags; - VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); - VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), - newpage); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); + VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); if (mem_cgroup_disabled()) return; - /* Page cache replacement: new page already charged? */ - if (page_memcg(newpage)) + /* Page cache replacement: new folio already charged? */ + if (folio_memcg(new)) return; - memcg = page_memcg(oldpage); - VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); + memcg = folio_memcg(old); + VM_WARN_ON_ONCE_FOLIO(!memcg, old); if (!memcg) return; /* Force-charge the new page. 
The old one will be freed soon */ - nr_pages = thp_nr_pages(newpage); - if (!mem_cgroup_is_root(memcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) @@ -6973,11 +6969,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) } css_get(&memcg->css); - commit_charge(newpage, memcg); + commit_charge(new, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, nr_pages); - memcg_check_events(memcg, newpage); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(new)); local_irq_restore(flags); } @@ -7204,8 +7200,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -nr_entries); - memcg_check_events(memcg, page); + mem_cgroup_charge_statistics(memcg, -nr_entries); + memcg_check_events(memcg, page_to_nid(page)); css_put(&memcg->css); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3e6449f2102a..93078a2859a7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -762,7 +762,7 @@ static int delete_from_lru_cache(struct page *p) * Poisoned page might never drop its ref count to 0 so we have * to uncharge it manually from its memcg. */ - mem_cgroup_uncharge(p); + mem_cgroup_uncharge(page_folio(p)); /* * drop the page count elevated by isolate_lru_page() @@ -1147,20 +1147,6 @@ static int __get_hwpoison_page(struct page *page) if (!HWPoisonHandlable(head)) return -EBUSY; - if (PageTransHuge(head)) { - /* - * Non anonymous thp exists only in allocation/free time. We - * can't handle such a case correctly, so let's give it up. - * This should be better than triggering BUG_ON when kernel - * tries to touch the "partially handled" page. - */ - if (!PageAnon(head)) { - pr_err("Memory failure: %#lx: non anonymous thp\n", - page_to_pfn(page)); - return 0; - } - } - if (get_page_unless_zero(head)) { if (head == compound_head(page)) return 1; @@ -1708,6 +1694,20 @@ try_again: } if (PageTransHuge(hpage)) { + /* + * The flag must be set after the refcount is bumped + * otherwise it may race with THP split. + * And the flag can't be set in get_hwpoison_page() since + * it is called by soft offline too and it is just called + * for !MF_COUNT_INCREASE. So here seems to be the best + * place. + * + * Don't need care about the above error handling paths for + * get_hwpoison_page() since they handle either free page + * or unhandlable page. The refcount is bumped iff the + * page is a valid handlable page. 
+ */ + SetPageHasHWPoisoned(hpage); if (try_to_split_thp_page(p, "Memory Failure") < 0) { action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); res = -EBUSY; diff --git a/mm/memory.c b/mm/memory.c index adf9b9ef8277..bcc4b0727a63 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -990,7 +990,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, if (!new_page) return NULL; - if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { + if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) { put_page(new_page); return NULL; } @@ -3019,7 +3019,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) goto oom_free_new; cgroup_throttle_swaprate(new_page, GFP_KERNEL); @@ -3539,7 +3539,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) shadow = get_shadow_from_swap_cache(entry); if (shadow) - workingset_refault(page, shadow); + workingset_refault(page_folio(page), + shadow); lru_cache_add(page); @@ -3769,7 +3770,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) goto oom_free_page; cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3907,6 +3908,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) return ret; /* + * Just backoff if any subpage of a THP is corrupted otherwise + * the corrupted page may mapped by PMD silently to escape the + * check. This kind of THP just can be PTE mapped. Access to + * the corrupted subpage should trigger SIGBUS as expected. + */ + if (unlikely(PageHasHWPoisoned(page))) + return ret; + + /* * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ @@ -4193,7 +4203,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { + if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm, + GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } @@ -4258,7 +4269,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ @@ -4499,7 +4510,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. - * See filemap_fault() and __lock_page_or_retry(). + * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { @@ -4603,7 +4614,7 @@ unlock: * By the time we get here, we already hold the mm semaphore * * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * return value. See filemap_fault() and __folio_lock_or_retry(). 
*/ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -4759,7 +4770,7 @@ static inline void mm_account_fault(struct pt_regs *regs, * By the time we get here, we already hold the mm semaphore * * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) @@ -5256,7 +5267,7 @@ void __might_fault(const char *file, int line) return; if (pagefault_disabled()) return; - __might_sleep(file, line, 0); + __might_sleep(file, line); #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1592b081c58e..f4b4be7af4d3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -856,16 +856,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } - if (flags & MPOL_F_NUMA_BALANCING) { - if (new && new->mode == MPOL_BIND) { - new->flags |= (MPOL_F_MOF | MPOL_F_MORON); - } else { - ret = -EINVAL; - mpol_put(new); - goto out; - } - } - ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { mpol_put(new); @@ -1458,7 +1448,11 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; - + if (*flags & MPOL_F_NUMA_BALANCING) { + if (*mode != MPOL_BIND) + return -EINVAL; + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + } return 0; } @@ -2202,6 +2196,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages); +struct folio *folio_alloc(gfp_t gfp, unsigned order) +{ + struct page *page = alloc_pages(gfp | __GFP_COMP, order); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(folio_alloc); + int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(vma_policy(src)); diff --git a/mm/mempool.c b/mm/mempool.c index 0b8afbec3e35..b933d0fc21b8 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -17,7 +17,6 @@ #include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> -#include <linux/blkdev.h> #include <linux/writeback.h> #include "slab.h" diff --git a/mm/memremap.c b/mm/memremap.c index ed593bf87109..5a66a71ab591 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -505,7 +505,7 @@ void free_devmap_managed_page(struct page *page) __ClearPageWaiters(page); - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); /* * When a device_private page is freed, the page->mapping field diff --git a/mm/migrate.c b/mm/migrate.c index a6a7743ee98f..efa9941ebe03 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -364,7 +364,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) */ expected_count += is_device_private_page(page); if (mapping) - expected_count += thp_nr_pages(page) + page_has_private(page); + expected_count += compound_nr(page) + page_has_private(page); return expected_count; } @@ -377,74 +377,75 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
*/ -int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, int extra_count) +int folio_migrate_mapping(struct address_space *mapping, + struct folio *newfolio, struct folio *folio, int extra_count) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct zone *oldzone, *newzone; int dirty; - int expected_count = expected_page_refs(mapping, page) + extra_count; - int nr = thp_nr_pages(page); + int expected_count = expected_page_refs(mapping, &folio->page) + extra_count; + long nr = folio_nr_pages(folio); if (!mapping) { /* Anonymous page without mapping */ - if (page_count(page) != expected_count) + if (folio_ref_count(folio) != expected_count) return -EAGAIN; /* No turning back from here */ - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - __SetPageSwapBacked(newpage); + newfolio->index = folio->index; + newfolio->mapping = folio->mapping; + if (folio_test_swapbacked(folio)) + __folio_set_swapbacked(newfolio); return MIGRATEPAGE_SUCCESS; } - oldzone = page_zone(page); - newzone = page_zone(newpage); + oldzone = folio_zone(folio); + newzone = folio_zone(newfolio); xas_lock_irq(&xas); - if (page_count(page) != expected_count || xas_load(&xas) != page) { + if (folio_ref_count(folio) != expected_count || + xas_load(&xas) != folio) { xas_unlock_irq(&xas); return -EAGAIN; } - if (!page_ref_freeze(page, expected_count)) { + if (!folio_ref_freeze(folio, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } /* - * Now we know that no one else is looking at the page: + * Now we know that no one else is looking at the folio: * no turning back from here. */ - newpage->index = page->index; - newpage->mapping = page->mapping; - page_ref_add(newpage, nr); /* add cache reference */ - if (PageSwapBacked(page)) { - __SetPageSwapBacked(newpage); - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); + newfolio->index = folio->index; + newfolio->mapping = folio->mapping; + folio_ref_add(newfolio, nr); /* add cache reference */ + if (folio_test_swapbacked(folio)) { + __folio_set_swapbacked(newfolio); + if (folio_test_swapcache(folio)) { + folio_set_swapcache(newfolio); + newfolio->private = folio_get_private(folio); } } else { - VM_BUG_ON_PAGE(PageSwapCache(page), page); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); } /* Move dirty while page refs frozen and newpage not yet exposed */ - dirty = PageDirty(page); + dirty = folio_test_dirty(folio); if (dirty) { - ClearPageDirty(page); - SetPageDirty(newpage); + folio_clear_dirty(folio); + folio_set_dirty(newfolio); } - xas_store(&xas, newpage); - if (PageTransHuge(page)) { + xas_store(&xas, newfolio); + if (nr > 1) { int i; for (i = 1; i < nr; i++) { xas_next(&xas); - xas_store(&xas, newpage); + xas_store(&xas, newfolio); } } @@ -453,7 +454,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. 
*/ - page_ref_unfreeze(page, expected_count - nr); + folio_ref_unfreeze(folio, expected_count - nr); xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ @@ -472,18 +473,18 @@ int migrate_page_move_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; - memcg = page_memcg(page); + memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); - if (PageSwapBacked(page) && !PageSwapCache(page)) { + if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } #ifdef CONFIG_SWAP - if (PageSwapCache(page)) { + if (folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } @@ -499,11 +500,11 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } -EXPORT_SYMBOL(migrate_page_move_mapping); +EXPORT_SYMBOL(folio_migrate_mapping); /* * The expected number of remaining references is the same as that - * of migrate_page_move_mapping(). + * of folio_migrate_mapping(). */ int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) @@ -538,91 +539,87 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, } /* - * Copy the page to its new location + * Copy the flags and some other ancillary information */ -void migrate_page_states(struct page *newpage, struct page *page) +void folio_migrate_flags(struct folio *newfolio, struct folio *folio) { int cpupid; - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (TestClearPageActive(page)) { - VM_BUG_ON_PAGE(PageUnevictable(page), page); - SetPageActive(newpage); - } else if (TestClearPageUnevictable(page)) - SetPageUnevictable(newpage); - if (PageWorkingset(page)) - SetPageWorkingset(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - /* Move dirty on pages not done by migrate_page_move_mapping() */ - if (PageDirty(page)) - SetPageDirty(newpage); - - if (page_is_young(page)) - set_page_young(newpage); - if (page_is_idle(page)) - set_page_idle(newpage); + if (folio_test_error(folio)) + folio_set_error(newfolio); + if (folio_test_referenced(folio)) + folio_set_referenced(newfolio); + if (folio_test_uptodate(folio)) + folio_mark_uptodate(newfolio); + if (folio_test_clear_active(folio)) { + VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); + folio_set_active(newfolio); + } else if (folio_test_clear_unevictable(folio)) + folio_set_unevictable(newfolio); + if (folio_test_workingset(folio)) + folio_set_workingset(newfolio); + if (folio_test_checked(folio)) + folio_set_checked(newfolio); + if (folio_test_mappedtodisk(folio)) + folio_set_mappedtodisk(newfolio); + + /* Move dirty on pages not done by folio_migrate_mapping() */ + if (folio_test_dirty(folio)) + folio_set_dirty(newfolio); + + if (folio_test_young(folio)) + folio_set_young(newfolio); + if (folio_test_idle(folio)) + folio_set_idle(newfolio); /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. 
*/ - cpupid = page_cpupid_xchg_last(page, -1); - page_cpupid_xchg_last(newpage, cpupid); + cpupid = page_cpupid_xchg_last(&folio->page, -1); + page_cpupid_xchg_last(&newfolio->page, cpupid); - ksm_migrate_page(newpage, page); + folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). */ - if (PageSwapCache(page)) - ClearPageSwapCache(page); - ClearPagePrivate(page); + if (folio_test_swapcache(folio)) + folio_clear_swapcache(folio); + folio_clear_private(folio); /* page->private contains hugetlb specific flags */ - if (!PageHuge(page)) - set_page_private(page, 0); + if (!folio_test_hugetlb(folio)) + folio->private = NULL; /* * If any waiters have accumulated on the new page then * wake them up. */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); + if (folio_test_writeback(newfolio)) + folio_end_writeback(newfolio); /* * PG_readahead shares the same bit with PG_reclaim. The above * end_page_writeback() may clear PG_readahead mistakenly, so set the * bit after that. */ - if (PageReadahead(page)) - SetPageReadahead(newpage); + if (folio_test_readahead(folio)) + folio_set_readahead(newfolio); - copy_page_owner(page, newpage); + folio_copy_owner(newfolio, folio); - if (!PageHuge(page)) - mem_cgroup_migrate(page, newpage); + if (!folio_test_hugetlb(folio)) + mem_cgroup_migrate(folio, newfolio); } -EXPORT_SYMBOL(migrate_page_states); +EXPORT_SYMBOL(folio_migrate_flags); -void migrate_page_copy(struct page *newpage, struct page *page) +void folio_migrate_copy(struct folio *newfolio, struct folio *folio) { - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - - migrate_page_states(newpage, page); + folio_copy(newfolio, folio); + folio_migrate_flags(newfolio, folio); } -EXPORT_SYMBOL(migrate_page_copy); +EXPORT_SYMBOL(folio_migrate_copy); /************************************************************ * Migration functions @@ -638,19 +635,21 @@ int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { + struct folio *newfolio = page_folio(newpage); + struct folio *folio = page_folio(page); int rc; - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, 0); + rc = folio_migrate_mapping(mapping, newfolio, folio, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(newfolio, folio); else - migrate_page_states(newpage, page); + folio_migrate_flags(newfolio, folio); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -2468,7 +2467,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * @page: struct page to check * * Pinned pages cannot be migrated. This is the same test as in - * migrate_page_move_mapping(), except that here we allow migration of a + * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. 
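/*
 * Editor's illustration, not part of the patch: a filesystem that keeps
 * one extra private reference on its pages could route its
 * ->migratepage() through the new folio helpers roughly as below.  The
 * function name and the extra reference are invented for the sketch;
 * migrate_page() above is the reference implementation for the common
 * case.
 */
static int example_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        struct folio *newfolio = page_folio(newpage);
        struct folio *folio = page_folio(page);
        int rc;

        /* One extra reference is held by our (hypothetical) private data. */
        rc = folio_migrate_mapping(mapping, newfolio, folio, 1);
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;

        if (mode != MIGRATE_SYNC_NO_COPY)
                folio_migrate_copy(newfolio, folio);
        else
                folio_migrate_flags(newfolio, folio);
        return MIGRATEPAGE_SUCCESS;
}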
*/ static bool migrate_vma_check_page(struct page *page) @@ -2846,7 +2845,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) goto abort; /* @@ -3066,7 +3065,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ -#if defined(CONFIG_MEMORY_HOTPLUG) +#if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { @@ -3209,25 +3208,6 @@ static void set_migration_target_nodes(void) } /* - * React to hotplug events that might affect the migration targets - * like events that online or offline NUMA nodes. - * - * The ordering is also currently dependent on which nodes have - * CPUs. That means we need CPU on/offline notification too. - */ -static int migration_online_cpu(unsigned int cpu) -{ - set_migration_target_nodes(); - return 0; -} - -static int migration_offline_cpu(unsigned int cpu) -{ - set_migration_target_nodes(); - return 0; -} - -/* * This leaves migrate-on-reclaim transiently disabled between * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs * whether reclaim-based migration is enabled or not, which @@ -3239,8 +3219,18 @@ static int migration_offline_cpu(unsigned int cpu) * set_migration_target_nodes(). */ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, - unsigned long action, void *arg) + unsigned long action, void *_arg) { + struct memory_notify *arg = _arg; + + /* + * Only update the node migration order when a node is + * changing status, like online->offline. This avoids + * the overhead of synchronize_rcu() in most cases. + */ + if (arg->status_change_nid < 0) + return notifier_from_errno(0); + switch (action) { case MEM_GOING_OFFLINE: /* @@ -3274,13 +3264,31 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, return notifier_from_errno(0); } +/* + * React to hotplug events that might affect the migration targets + * like events that online or offline NUMA nodes. + * + * The ordering is also currently dependent on which nodes have + * CPUs. That means we need CPU on/offline notification too. + */ +static int migration_online_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +static int migration_offline_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + static int __init migrate_on_reclaim_init(void) { int ret; - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim", - migration_online_cpu, - migration_offline_cpu); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", + NULL, migration_offline_cpu); /* * In the unlikely case that this fails, the automatic * migration targets may become suboptimal for nodes @@ -3288,9 +3296,12 @@ static int __init migrate_on_reclaim_init(void) * rare case, do not bother trying to do anything special. 
*/ WARN_ON(ret < 0); + ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online", + migration_online_cpu, NULL); + WARN_ON(ret < 0); hotplug_memory_notifier(migrate_on_reclaim_callback, 100); return 0; } late_initcall(migrate_on_reclaim_init); -#endif /* CONFIG_MEMORY_HOTPLUG */ +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/mm/mlock.c b/mm/mlock.c index 16d2ee160d43..e263d62ae2d0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) /* Phase 1: page isolation */ for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(page); if (TestClearPageMlocked(page)) { /* @@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() here. */ if (TestClearPageLRU(page)) { - lruvec = relock_page_lruvec_irq(page, lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); del_page_from_lru_list(page, lruvec); continue; } else diff --git a/mm/nommu.c b/mm/nommu.c index 02d2427b8f9e..41ef204e7482 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -27,7 +27,6 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/compiler.h> #include <linux/mount.h> diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 831340e7ad8b..989f35a2bbb1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1150,7 +1150,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) struct task_struct *task; struct task_struct *p; unsigned int f_flags; - bool reap = true; + bool reap = false; struct pid *pid; long ret = 0; @@ -1177,15 +1177,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - mm = p->mm; - mmgrab(mm); - - /* If the work has been done already, just exit with success */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) - reap = false; - else if (!task_will_free_mem(p)) { - reap = false; - ret = -EINVAL; + if (mmget_not_zero(p->mm)) { + mm = p->mm; + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; + } } task_unlock(p); @@ -1201,7 +1201,8 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) mmap_read_unlock(mm); drop_mm: - mmdrop(mm); + if (mm) + mmput(mm); put_task: put_task_struct(task); put_pid: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4812a17b288c..9c64490171e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -static void wb_domain_writeout_inc(struct wb_domain *dom, +static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, - unsigned int max_prop_frac) + unsigned int max_prop_frac, long nr) { - __fprop_inc_percpu_max(&dom->completions, completions, - max_prop_frac); + __fprop_add_percpu_max(&dom->completions, completions, + max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* @@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, /* * Increment @wb's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). + * completion count. Called from __folio_end_writeback(). 
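/*
 * Editor's illustration, not part of the patch: the process_mrelease()
 * hunk above switches from mmgrab()/mmdrop() to mmget_not_zero()/mmput()
 * so that the mm is only used while it still has real users.  A minimal
 * sketch of that pattern (function name invented):
 */
static int example_with_task_mm(struct task_struct *p)
{
        struct mm_struct *mm = NULL;
        int ret = -EINVAL;

        task_lock(p);
        if (p->mm && mmget_not_zero(p->mm)) {
                mm = p->mm;             /* we now hold an mm_users reference */
                ret = 0;
        }
        task_unlock(p);

        if (mm) {
                /* ... work on mm under mmap_read_lock(mm) ... */
                mmput(mm);
        }
        return ret;
}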
*/ -static inline void __wb_writeout_inc(struct bdi_writeback *wb) +static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; - inc_wb_stat(wb, WB_WRITTEN); - wb_domain_writeout_inc(&global_wb_domain, &wb->completions, - wb->bdi->max_prop_frac); + wb_stat_mod(wb, WB_WRITTEN, nr); + wb_domain_writeout_add(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) - wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), - wb->bdi->max_prop_frac); + wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) @@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb) unsigned long flags; local_irq_save(flags); - __wb_writeout_inc(wb); + __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); @@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, * write_bandwidth = --------------------------------------------------- * period * - * @written may have decreased due to account_page_redirty(). + * @written may have decreased due to folio_account_redirty(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); @@ -2381,44 +2381,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) } /** - * write_one_page - write out a single page and wait on I/O - * @page: the page to write + * folio_write_one - write out a single folio and wait on I/O. + * @folio: The folio to write. * - * The page must be locked by the caller and will be unlocked upon return. + * The folio must be locked by the caller and will be unlocked upon return. * * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this * function returns. * * Return: %0 on success, negative error code otherwise */ -int write_one_page(struct page *page) +int folio_write_one(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, + .nr_to_write = folio_nr_pages(folio), }; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (clear_page_dirty_for_io(page)) { - get_page(page); - ret = mapping->a_ops->writepage(page, &wbc); + if (folio_clear_dirty_for_io(folio)) { + folio_get(folio); + ret = mapping->a_ops->writepage(&folio->page, &wbc); if (ret == 0) - wait_on_page_writeback(page); - put_page(page); + folio_wait_writeback(folio); + folio_put(folio); } else { - unlock_page(page); + folio_unlock(folio); } if (!ret) ret = filemap_check_errors(mapping); return ret; } -EXPORT_SYMBOL(write_one_page); +EXPORT_SYMBOL(folio_write_one); /* * For address_spaces which do not use buffers nor write back. @@ -2438,29 +2438,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback); * * NOTE: This relies on being atomic wrt interrupts. 
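/*
 * Editor's illustration, not part of the patch: folio_write_one() above
 * expects a locked folio and unlocks it before returning, so a caller
 * that wants to push a single folio to storage looks roughly like this
 * (wrapper name invented):
 */
static int example_write_folio_now(struct folio *folio)
{
        folio_lock(folio);
        return folio_write_one(folio);  /* unlocks the folio */
}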
*/ -static void account_page_dirtied(struct page *page, +static void folio_account_dirtied(struct folio *folio, struct address_space *mapping) { struct inode *inode = mapping->host; - trace_writeback_dirty_page(page, mapping); + trace_writeback_dirty_folio(folio, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; + long nr = folio_nr_pages(folio); - inode_attach_wb(inode, page); + inode_attach_wb(inode, &folio->page); wb = inode_to_wb(inode); - __inc_lruvec_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); - __inc_node_page_state(page, NR_DIRTIED); - inc_wb_stat(wb, WB_RECLAIMABLE); - inc_wb_stat(wb, WB_DIRTIED); - task_io_account_write(PAGE_SIZE); - current->nr_dirtied++; - __this_cpu_inc(bdp_ratelimits); + __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + __node_stat_mod_folio(folio, NR_DIRTIED, nr); + wb_stat_mod(wb, WB_RECLAIMABLE, nr); + wb_stat_mod(wb, WB_DIRTIED, nr); + task_io_account_write(nr * PAGE_SIZE); + current->nr_dirtied += nr; + __this_cpu_add(bdp_ratelimits, nr); - mem_cgroup_track_foreign_dirty(page, wb); + mem_cgroup_track_foreign_dirty(folio, wb); } } @@ -2469,130 +2470,152 @@ static void account_page_dirtied(struct page *page, * * Caller must hold lock_page_memcg(). */ -void account_page_cleaned(struct page *page, struct address_space *mapping, +void folio_account_cleaned(struct folio *folio, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_can_writeback(mapping)) { - dec_lruvec_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - dec_wb_stat(wb, WB_RECLAIMABLE); - task_io_account_cancelled_write(PAGE_SIZE); + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + task_io_account_cancelled_write(nr * PAGE_SIZE); } } /* - * Mark the page dirty, and set it dirty in the page cache, and mark the inode - * dirty. + * Mark the folio dirty, and set it dirty in the page cache, and mark + * the inode dirty. * - * If warn is true, then emit a warning if the page is not uptodate and has + * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * * The caller must hold lock_page_memcg(). */ -void __set_page_dirty(struct page *page, struct address_space *mapping, +void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) { unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), + if (folio->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); + folio_account_dirtied(folio, mapping); + __xa_set_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } -/* - * For address_spaces which do not use buffers. Just tag the page as dirty in - * the xarray. +/** + * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. + * @mapping: Address space this folio belongs to. + * @folio: Folio to be marked as dirty. + * + * Filesystems which do not use buffer heads should call this function + * from their set_page_dirty address space operation. 
It ignores the + * contents of folio_get_private(), so if the filesystem marks individual + * blocks as dirty, the filesystem should handle that itself. * - * This is also used when a single buffer is being dirtied: we want to set the - * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * This is also sometimes used by filesystems which use buffer_heads when + * a single buffer is being dirtied: we want to set the folio dirty in + * that case, but not all the buffers. This is a "bottom-up" dirtying, + * whereas __set_page_dirty_buffers() is a "top-down" dirtying. * - * The caller must ensure this doesn't race with truncation. Most will simply - * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and - * the pte lock held, which also locks out truncation. + * The caller must ensure this doesn't race with truncation. Most will + * simply hold the folio lock, but e.g. zap_pte_range() calls with the + * folio mapped and the pte lock held, which also locks out truncation. */ -int __set_page_dirty_nobuffers(struct page *page) +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); + folio_memcg_lock(folio); + if (folio_test_set_dirty(folio)) { + folio_memcg_unlock(folio); + return false; + } - if (!mapping) { - unlock_page_memcg(page); - return 1; - } - __set_page_dirty(page, mapping, !PagePrivate(page)); - unlock_page_memcg(page); + __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); + folio_memcg_unlock(folio); - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - } - return 1; + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - unlock_page_memcg(page); - return 0; + return true; } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); +EXPORT_SYMBOL(filemap_dirty_folio); -/* - * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to - * systematic errors in balanced_dirty_ratelimit and the dirty pages position - * control. +/** + * folio_account_redirty - Manually account for redirtying a page. + * @folio: The folio which is being redirtied. + * + * Most filesystems should call folio_redirty_for_writepage() instead + * of this fuction. If your filesystem is doing writeback outside the + * context of a writeback_control(), it can call this when redirtying + * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED, + * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN, + * WB_WRITTEN) in long term. The mismatches will lead to systematic errors + * in balanced_dirty_ratelimit and the dirty pages position control. 
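/*
 * Editor's illustration, not part of the patch: while the
 * address_space_operations still take a struct page, a filesystem
 * without buffer_heads can forward its ->set_page_dirty() to the new
 * helper above.  The names below are invented for the sketch.
 */
static int example_set_page_dirty(struct page *page)
{
        struct folio *folio = page_folio(page);

        return filemap_dirty_folio(folio_mapping(folio), folio);
}

static const struct address_space_operations example_aops = {
        .set_page_dirty = example_set_page_dirty,
        /* .readpage, .writepage, ... */
};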
*/ -void account_page_redirty(struct page *page) +void folio_account_redirty(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; + long nr = folio_nr_pages(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); - current->nr_dirtied--; - dec_node_page_state(page, NR_DIRTIED); - dec_wb_stat(wb, WB_DIRTIED); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } } -EXPORT_SYMBOL(account_page_redirty); +EXPORT_SYMBOL(folio_account_redirty); -/* - * When a writepage implementation decides that it doesn't want to write this - * page for some reason, it should redirty the locked page via - * redirty_page_for_writepage() and it should then unlock the page and return 0 +/** + * folio_redirty_for_writepage - Decline to write a dirty folio. + * @wbc: The writeback control. + * @folio: The folio. + * + * When a writepage implementation decides that it doesn't want to write + * @folio for some reason, it should call this function, unlock @folio and + * return 0. + * + * Return: True if we redirtied the folio. False if someone else dirtied + * it first. */ -int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +bool folio_redirty_for_writepage(struct writeback_control *wbc, + struct folio *folio) { - int ret; + bool ret; + long nr = folio_nr_pages(folio); + + wbc->pages_skipped += nr; + ret = filemap_dirty_folio(folio->mapping, folio); + folio_account_redirty(folio); - wbc->pages_skipped++; - ret = __set_page_dirty_nobuffers(page); - account_page_redirty(page); return ret; } -EXPORT_SYMBOL(redirty_page_for_writepage); +EXPORT_SYMBOL(folio_redirty_for_writepage); -/* - * Dirty a page. +/** + * folio_mark_dirty - Mark a folio as being modified. + * @folio: The folio. * - * For pages with a mapping this should be done under the page lock for the - * benefit of asynchronous memory errors who prefer a consistent dirty state. - * This rule can be broken in some special cases, but should be better not to. + * For folios with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * + * Return: True if the folio was newly dirtied, false if it was already dirty. */ -int set_page_dirty(struct page *page) +bool folio_mark_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); - page = compound_head(page); if (likely(mapping)) { /* * readahead/lru_deactivate_page could remain @@ -2604,17 +2627,17 @@ int set_page_dirty(struct page *page) * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. 
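/*
 * Editor's illustration, not part of the patch: a ->writepage()
 * implementation that decides it cannot write the folio right now hands
 * it back with folio_redirty_for_writepage() above.  The helpers below
 * are invented stand-ins so the sketch is self-contained.
 */
static bool example_fs_busy(void)
{
        return false;   /* stand-in for a filesystem-specific test */
}

static int example_do_writeback(struct folio *folio,
                struct writeback_control *wbc)
{
        /* ... set writeback, submit I/O ... */
        folio_unlock(folio);
        return 0;
}

static int example_writepage(struct page *page, struct writeback_control *wbc)
{
        struct folio *folio = page_folio(page);

        if (wbc->sync_mode != WB_SYNC_ALL && example_fs_busy()) {
                /* Decline: re-dirty, unlock and report success. */
                folio_redirty_for_writepage(wbc, folio);
                folio_unlock(folio);
                return 0;
        }
        return example_do_writeback(folio, wbc);
}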
*/ - if (PageReclaim(page)) - ClearPageReclaim(page); - return mapping->a_ops->set_page_dirty(page); + if (folio_test_reclaim(folio)) + folio_clear_reclaim(folio); + return mapping->a_ops->set_page_dirty(&folio->page); } - if (!PageDirty(page)) { - if (!TestSetPageDirty(page)) - return 1; + if (!folio_test_dirty(folio)) { + if (!folio_test_set_dirty(folio)) + return true; } - return 0; + return false; } -EXPORT_SYMBOL(set_page_dirty); +EXPORT_SYMBOL(folio_mark_dirty); /* * set_page_dirty() is racy if the caller has no reference against @@ -2650,49 +2673,49 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void __cancel_dirty_page(struct page *page) +void __folio_cancel_dirty(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; - lock_page_memcg(page); + folio_memcg_lock(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); - if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, wb); + if (folio_test_clear_dirty(folio)) + folio_account_cleaned(folio, mapping, wb); unlocked_inode_to_wb_end(inode, &cookie); - unlock_page_memcg(page); + folio_memcg_unlock(folio); } else { - ClearPageDirty(page); + folio_clear_dirty(folio); } } -EXPORT_SYMBOL(__cancel_dirty_page); +EXPORT_SYMBOL(__folio_cancel_dirty); /* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. - * - * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the xarray so that a concurrent write-for-sync - * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage - * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and xarray dirty tag - * back into sync. - * - * This incoherency between the page's dirty flag and xarray tag is - * unfortunate, but it only exists while the page is locked. + * Clear a folio's dirty flag, while caring for dirty memory accounting. + * Returns true if the folio was previously dirty. + * + * This is for preparing to put the folio under writeout. We leave + * the folio tagged as dirty in the xarray so that a concurrent + * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. + * The ->writepage implementation will run either folio_start_writeback() + * or folio_mark_dirty(), at which stage we bring the folio's dirty flag + * and xarray dirty tag back into sync. + * + * This incoherency between the folio's dirty flag and xarray tag is + * unfortunate, but it only exists while the folio is locked. 
*/ -int clear_page_dirty_for_io(struct page *page) +bool folio_clear_dirty_for_io(struct folio *folio) { - struct address_space *mapping = page_mapping(page); - int ret = 0; + struct address_space *mapping = folio_mapping(folio); + bool ret = false; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; @@ -2705,48 +2728,49 @@ int clear_page_dirty_for_io(struct page *page) * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to - * mark the whole page dirty if it was + * mark the whole folio dirty if it was * dirty in a pagetable. Only to then - * (c) clean the page again and return 1 to + * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * - * Note! Normally the "set_page_dirty(page)" + * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * - * We basically use the page "master dirty bit" + * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ - if (page_mkclean(page)) - set_page_dirty(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against - * installing a dirty pte and marking the page dirty + * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the - * page lock while dirtying the page, and pages are + * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. 
*/ wb = unlocked_inode_to_wb_begin(inode, &cookie); - if (TestClearPageDirty(page)) { - dec_lruvec_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - dec_wb_stat(wb, WB_RECLAIMABLE); - ret = 1; + if (folio_test_clear_dirty(folio)) { + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } - return TestClearPageDirty(page); + return folio_test_clear_dirty(folio); } -EXPORT_SYMBOL(clear_page_dirty_for_io); +EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) { @@ -2766,27 +2790,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); } -int test_clear_page_writeback(struct page *page) +bool __folio_end_writeback(struct folio *folio) { - struct address_space *mapping = page_mapping(page); - int ret; + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + bool ret; - lock_page_memcg(page); + folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - ret = TestClearPageWriteback(page); + ret = folio_test_clear_writeback(folio); if (ret) { - __xa_clear_mark(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); - dec_wb_stat(wb, WB_WRITEBACK); - __wb_writeout_inc(wb); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) wb_inode_writeback_end(wb); @@ -2799,32 +2824,34 @@ int test_clear_page_writeback(struct page *page) xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - ret = TestClearPageWriteback(page); + ret = folio_test_clear_writeback(folio); } if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); - inc_node_page_state(page, NR_WRITTEN); + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + node_stat_mod_folio(folio, NR_WRITTEN, nr); } - unlock_page_memcg(page); + folio_memcg_unlock(folio); return ret; } -int __test_set_page_writeback(struct page *page, bool keep_write) +bool __folio_start_writeback(struct folio *folio, bool keep_write) { - struct address_space *mapping = page_mapping(page); - int ret, access_ret; + long nr = folio_nr_pages(folio); + struct address_space *mapping = folio_mapping(folio); + bool ret; + int access_ret; - lock_page_memcg(page); + folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xas_lock_irqsave(&xas, flags); xas_load(&xas); - ret = TestSetPageWriteback(page); + ret = folio_test_set_writeback(folio); if (!ret) { bool on_wblist; @@ -2835,84 +2862,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write) if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); 
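			/*
			 * [Editor's note, not part of the patch: from here the
			 * statistics are adjusted by nr == folio_nr_pages(folio)
			 * in a single call (wb_stat_mod(), lruvec_stat_mod_folio(),
			 * zone_stat_mod_folio()) rather than once per base page,
			 * matching the per-folio accounting used elsewhere in
			 * this file.]
			 */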
- inc_wb_stat(wb, WB_WRITEBACK); + wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) wb_inode_writeback_start(wb); } /* - * We can come through here when swapping anonymous - * pages, so we don't necessarily have an inode to track - * for sync. + * We can come through here when swapping + * anonymous folios, so we don't necessarily + * have an inode to track for sync. */ if (mapping->host && !on_wblist) sb_mark_inode_writeback(mapping->host); } - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { - ret = TestSetPageWriteback(page); + ret = folio_test_set_writeback(folio); } if (!ret) { - inc_lruvec_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); } - unlock_page_memcg(page); - access_ret = arch_make_page_accessible(page); + folio_memcg_unlock(folio); + access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ - VM_BUG_ON_PAGE(access_ret != 0, page); + VM_BUG_ON_FOLIO(access_ret != 0, folio); return ret; - } -EXPORT_SYMBOL(__test_set_page_writeback); +EXPORT_SYMBOL(__folio_start_writeback); -/* - * Wait for a page to complete writeback +/** + * folio_wait_writeback - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. */ -void wait_on_page_writeback(struct page *page) +void folio_wait_writeback(struct folio *folio) { - while (PageWriteback(page)) { - trace_wait_on_page_writeback(page, page_mapping(page)); - wait_on_page_bit(page, PG_writeback); + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + folio_wait_bit(folio, PG_writeback); } } -EXPORT_SYMBOL_GPL(wait_on_page_writeback); +EXPORT_SYMBOL_GPL(folio_wait_writeback); -/* - * Wait for a page to complete writeback. Returns -EINTR if we get a - * fatal signal while waiting. +/** + * folio_wait_writeback_killable - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete or a fatal signal to arrive. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + * Return: 0 on success, -EINTR if we get a fatal signal while waiting. 
*/ -int wait_on_page_writeback_killable(struct page *page) +int folio_wait_writeback_killable(struct folio *folio) { - while (PageWriteback(page)) { - trace_wait_on_page_writeback(page, page_mapping(page)); - if (wait_on_page_bit_killable(page, PG_writeback)) + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + if (folio_wait_bit_killable(folio, PG_writeback)) return -EINTR; } return 0; } -EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable); +EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** - * wait_for_stable_page() - wait for writeback to finish, if necessary. - * @page: The page to wait on. + * folio_wait_stable() - wait for writeback to finish, if necessary. + * @folio: The folio to wait on. + * + * This function determines if the given folio is related to a backing + * device that requires folio contents to be held stable during writeback. + * If so, then it will wait for any pending writeback to complete. * - * This function determines if the given page is related to a backing device - * that requires page contents to be held stable during writeback. If so, then - * it will wait for any pending writeback to complete. + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. */ -void wait_for_stable_page(struct page *page) +void folio_wait_stable(struct folio *folio) { - page = thp_head(page); - if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) - wait_on_page_writeback(page); + if (folio->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) + folio_wait_writeback(folio); } -EXPORT_SYMBOL_GPL(wait_for_stable_page); +EXPORT_SYMBOL_GPL(folio_wait_stable); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b37435c274cf..fee18ada46a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -724,7 +724,7 @@ static inline void free_the_page(struct page *page, unsigned int order) void free_compound_page(struct page *page) { - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); free_the_page(page, compound_order(page)); } @@ -1312,8 +1312,10 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) + if (compound) { ClearPageDoubleMap(page); + ClearPageHasHWPoisoned(page); + } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -5223,6 +5225,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(page_array && nr_pages - nr_populated == 0)) goto out; + /* Bulk allocator does not support memcg accounting. */ + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + goto failed; + /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) goto failed; @@ -5400,6 +5406,18 @@ out: } EXPORT_SYMBOL(__alloc_pages); +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ + struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + preferred_nid, nodemask); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(__folio_alloc); + /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. 
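/*
 * Editor's illustration, not part of the patch: __folio_alloc() above is
 * the NUMA-aware entry point; a caller wanting an order-2 folio on the
 * local node might do roughly the following (function name invented;
 * higher-level folio_alloc()-style wrappers are assumed to sit on top of
 * this elsewhere):
 */
static struct folio *example_alloc_order2_folio(gfp_t gfp)
{
        struct folio *folio;

        folio = __folio_alloc(gfp, 2, numa_node_id(), NULL);
        if (!folio)
                return NULL;
        /* folio_nr_pages(folio) is now 4; release with folio_put(). */
        return folio;
}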
Use alloc_pages and then kmap if diff --git a/mm/page_ext.c b/mm/page_ext.c index dfb91653d359..2a52fd9ed464 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -269,7 +269,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) total_usage += table_size; return 0; } -#ifdef CONFIG_MEMORY_HOTPLUG + static void free_page_ext(void *addr) { if (is_vmalloc_addr(addr)) { @@ -374,8 +374,6 @@ static int __meminit page_ext_callback(struct notifier_block *self, return notifier_from_errno(ret); } -#endif - void __init page_ext_init(void) { unsigned long pfn; diff --git a/mm/page_io.c b/mm/page_io.c index c493ce9ebcf5..9725c7e1eeea 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -38,7 +38,7 @@ void end_swap_bio_write(struct bio *bio) * Also print a dire warning that things will go BAD (tm) * very quickly. * - * Also clear PG_reclaim to avoid rotate_reclaimable_page() + * Also clear PG_reclaim to avoid folio_rotate_reclaimable() */ set_page_dirty(page); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", @@ -317,7 +317,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, * temporary failure if the system has limited * memory for allocating transmit buffers. * Mark the page dirty and avoid - * rotate_reclaimable_page but rate-limit the + * folio_rotate_reclaimable but rate-limit the * messages but do not flag PageError like * the normal direct-to-bio case as it could * be temporary. @@ -358,8 +358,6 @@ int swap_readpage(struct page *page, bool synchronous) struct bio *bio; int ret = 0; struct swap_info_struct *sis = page_swap_info(page); - blk_qc_t qc; - struct gendisk *disk; unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); @@ -409,26 +407,24 @@ int swap_readpage(struct page *page, bool synchronous) bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; bio_add_page(bio, page, thp_size(page), 0); - - disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ if (synchronous) { - bio->bi_opf |= REQ_HIPRI; + bio->bi_opf |= REQ_POLLED; get_task_struct(current); bio->bi_private = current; } count_vm_event(PSWPIN); bio_get(bio); - qc = submit_bio(bio); + submit_bio(bio); while (synchronous) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio->bi_private)) break; - if (!blk_poll(disk->queue, qc, true)) + if (!bio_poll(bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/mm/page_owner.c b/mm/page_owner.c index 62402d22539b..d24ed221357c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -210,10 +210,10 @@ void __split_page_owner(struct page *page, unsigned int nr) } } -void __copy_page_owner(struct page *oldpage, struct page *newpage) +void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - struct page_ext *old_ext = lookup_page_ext(oldpage); - struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_ext *old_ext = lookup_page_ext(&old->page); + struct page_ext *new_ext = lookup_page_ext(&newfolio->page); struct page_owner *old_page_owner, *new_page_owner; if (unlikely(!old_ext || !new_ext)) @@ -231,11 +231,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* - * We don't clear the bit on the oldpage as it's going to be freed + * We don't clear the bit on the old folio as it's going to be freed * after migration. 
Until then, the info can be useful in case of * a bug, and the overall stats will be off a bit only temporarily. * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the oldpage to retain the info. But + * migration and then we want the old folio to retain the info. But * in that case we also don't need to explicitly clear the info from * the new page, which will be freed. */ diff --git a/mm/readahead.c b/mm/readahead.c index 41b75d76d36e..e71e719e36c9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -12,7 +12,6 @@ #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> -#include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> diff --git a/mm/rmap.c b/mm/rmap.c index 6aebd1747251..3a1059c284c3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -34,7 +34,7 @@ * mapping->private_lock (in __set_page_dirty_buffers) * lock_page_memcg move_lock (in __set_page_dirty_buffers) * i_pages lock (widely used) - * lruvec->lru_lock (in lock_page_lruvec_irq) + * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) @@ -981,7 +981,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) return true; } -int page_mkclean(struct page *page) +int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; @@ -991,20 +991,20 @@ int page_mkclean(struct page *page) .invalid_vma = invalid_mkclean_vma, }; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_mapped(page)) + if (!folio_mapped(folio)) return 0; - mapping = page_mapping(page); + mapping = folio_mapping(folio); if (!mapping) return 0; - rmap_walk(page, &rwc); + rmap_walk(&folio->page, &rwc); return cleaned; } -EXPORT_SYMBOL_GPL(page_mkclean); +EXPORT_SYMBOL_GPL(folio_mkclean); /** * page_move_anon_rmap - move a page to our anon_vma diff --git a/mm/secretmem.c b/mm/secretmem.c index 1fea68b8d5a6..22b310adb53d 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -18,7 +18,6 @@ #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> -#include <linux/refcount.h> #include <uapi/linux/magic.h> @@ -41,11 +40,11 @@ module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); -static refcount_t secretmem_users; +static atomic_t secretmem_users; bool secretmem_active(void) { - return !!refcount_read(&secretmem_users); + return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) @@ -104,7 +103,7 @@ static const struct vm_operations_struct secretmem_vm_ops = { static int secretmem_release(struct inode *inode, struct file *file) { - refcount_dec(&secretmem_users); + atomic_dec(&secretmem_users); return 0; } @@ -204,6 +203,8 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; + if (atomic_read(&secretmem_users) < 0) + return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) @@ -217,8 +218,8 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) file->f_flags |= O_LARGEFILE; + atomic_inc(&secretmem_users); fd_install(fd, file); - refcount_inc(&secretmem_users); return fd; err_put_fd: diff --git a/mm/shmem.c b/mm/shmem.c index b5860f4a2738..17e344e26e73 100644 --- a/mm/shmem.c +++ b/mm/shmem.c 
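/*
 * Editor's illustration, not part of the patch: the mm/secretmem.c hunk
 * above replaces a refcount_t with a plain atomic_t for counting users
 * of the feature, incrementing only once the new file has been set up
 * and refusing new users if the counter ever looks wrapped.  A condensed
 * sketch of that pattern (names invented):
 */
static atomic_t example_users;

static int example_new_user(void)
{
        if (atomic_read(&example_users) < 0)    /* overflow guard */
                return -ENFILE;
        /* ... create and install the file descriptor ... */
        atomic_inc(&example_users);
        return 0;
}

static void example_drop_user(void)
{
        atomic_dec(&example_users);
}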
@@ -59,7 +59,6 @@ static struct vfsmount *shm_mnt; #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> @@ -710,7 +709,7 @@ static int shmem_add_to_page_cache(struct page *page, page->index = index; if (!PageSwapCache(page)) { - error = mem_cgroup_charge(page, charge_mm, gfp); + error = mem_cgroup_charge(page_folio(page), charge_mm, gfp); if (error) { if (PageTransHuge(page)) { count_vm_event(THP_FILE_FALLBACK); @@ -1637,6 +1636,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct page *oldpage, *newpage; + struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; @@ -1673,7 +1673,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - mem_cgroup_migrate(oldpage, newpage); + old = page_folio(oldpage); + new = page_folio(newpage); + mem_cgroup_migrate(old, new); __inc_lruvec_page_state(newpage, NR_FILE_PAGES); __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } diff --git a/mm/slab.c b/mm/slab.c index d0f725637663..874b3f8fe80d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1095,7 +1095,7 @@ static int slab_offline_cpu(unsigned int cpu) return 0; } -#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +#if defined(CONFIG_NUMA) /* * Drains freelist for a node on each slab cache, used for memory hot-remove. * Returns -EBUSY if all objects cannot be drained so that the node is not @@ -1157,7 +1157,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self, out: return notifier_from_errno(ret); } -#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ +#endif /* CONFIG_NUMA */ /* * swap the static kmem_cache_node with kmalloced memory diff --git a/mm/slub.c b/mm/slub.c index 3d2025f7163b..d8f77346376d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1701,7 +1701,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, } static inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail) + void **head, void **tail, + int *cnt) { void *object; @@ -1728,6 +1729,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, *head = object; if (!*tail) *tail = object; + } else { + /* + * Adjust the reconstructed freelist depth + * accordingly if object's reuse is delayed. + */ + --(*cnt); } } while (object != old_tail); @@ -3413,7 +3420,9 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long tid; - memcg_slab_free_hook(s, &head, 1); + /* memcg_slab_free_hook() is already called for bulk free. */ + if (!tail) + memcg_slab_free_hook(s, &head, 1); redo: /* * Determine the currently cpus per cpu slab. @@ -3480,7 +3489,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. 
*/ - if (slab_free_freelist_hook(s, &head, &tail)) + if (slab_free_freelist_hook(s, &head, &tail, &cnt)) do_slab_free(s, page, head, tail, cnt, addr); } @@ -4203,8 +4212,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) if (alloc_kmem_cache_cpus(s)) return 0; - free_kmem_cache_nodes(s); error: + __kmem_cache_release(s); return -EINVAL; } @@ -4880,13 +4889,15 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; err = sysfs_slab_add(s); - if (err) + if (err) { __kmem_cache_release(s); + return err; + } if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); - return err; + return 0; } void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) @@ -6108,9 +6119,14 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) struct kmem_cache *s = file_inode(filep)->i_private; unsigned long *obj_map; + if (!t) + return -ENOMEM; + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); - if (!obj_map) + if (!obj_map) { + seq_release_private(inode, filep); return -ENOMEM; + } if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) alloc = TRACK_ALLOC; @@ -6119,6 +6135,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { bitmap_free(obj_map); + seq_release_private(inode, filep); return -ENOMEM; } diff --git a/mm/swap.c b/mm/swap.c index af3cad4e5378..8ff9ba7cf2de 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { + struct folio *folio = page_folio(page); struct lruvec *lruvec; unsigned long flags; - lruvec = lock_page_lruvec_irqsave(page, &flags); + lruvec = folio_lruvec_lock_irqsave(folio, &flags); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); @@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); free_unref_page(page, 0); } @@ -188,12 +189,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(page); /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); (*move_fn)(page, lruvec); SetPageLRU(page); @@ -206,11 +208,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { - if (!PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - add_page_to_lru_list_tail(page, lruvec); - __count_vm_events(PGROTATED, thp_nr_pages(page)); + struct folio *folio = page_folio(page); + + if (!folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, folio_nr_pages(folio)); } } @@ -227,23 +231,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) } /* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. 
+ * Writeback is about to end against a folio which has been marked for + * immediate reclaim. If it still appears to be reclaimable, move it + * to the tail of the inactive list. * - * rotate_reclaimable_page() must disable IRQs, to prevent nasty races. + * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ -void rotate_reclaimable_page(struct page *page) +void folio_rotate_reclaimable(struct folio *folio) { - if (!PageLocked(page) && !PageDirty(page) && - !PageUnevictable(page) && PageLRU(page)) { + if (!folio_test_locked(folio) && !folio_test_dirty(folio) && + !folio_test_unevictable(folio) && folio_test_lru(folio)) { struct pagevec *pvec; unsigned long flags; - get_page(page); + folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); - if (pagevec_add_and_need_flush(pvec, page)) + if (pagevec_add_and_need_flush(pvec, &folio->page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -289,21 +293,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) } while ((lruvec = parent_lruvec(lruvec))); } -void lru_note_cost_page(struct page *page) +void lru_note_cost_folio(struct folio *folio) { - lru_note_cost(mem_cgroup_page_lruvec(page), - page_is_file_lru(page), thp_nr_pages(page)); + lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), + folio_nr_pages(folio)); } -static void __activate_page(struct page *page, struct lruvec *lruvec) +static void __folio_activate(struct folio *folio, struct lruvec *lruvec) { - if (!PageActive(page) && !PageUnevictable(page)) { - int nr_pages = thp_nr_pages(page); + if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); - del_page_from_lru_list(page, lruvec); - SetPageActive(page); - add_page_to_lru_list(page, lruvec); - trace_mm_lru_activate(page); + lruvec_del_folio(lruvec, folio); + folio_set_active(folio); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, @@ -312,6 +316,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec) } #ifdef CONFIG_SMP +static void __activate_page(struct page *page, struct lruvec *lruvec) +{ + return __folio_activate(page_folio(page), lruvec); +} + static void activate_page_drain(int cpu) { struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); @@ -325,16 +334,16 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -static void activate_page(struct page *page) +static void folio_activate(struct folio *folio) { - page = compound_head(page); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + if (folio_test_lru(folio) && !folio_test_active(folio) && + !folio_test_unevictable(folio)) { struct pagevec *pvec; + folio_get(folio); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); - get_page(page); - if (pagevec_add_and_need_flush(pvec, page)) + if (pagevec_add_and_need_flush(pvec, &folio->page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } @@ -345,21 +354,20 @@ static inline void activate_page_drain(int cpu) { } -static void activate_page(struct page *page) +static void folio_activate(struct folio *folio) { struct lruvec *lruvec; - page = compound_head(page); - if (TestClearPageLRU(page)) { - lruvec = lock_page_lruvec_irq(page); - __activate_page(page, 
lruvec); + if (folio_test_clear_lru(folio)) { + lruvec = folio_lruvec_lock_irq(folio); + __folio_activate(folio, lruvec); unlock_page_lruvec_irq(lruvec); - SetPageLRU(page); + folio_set_lru(folio); } } #endif -static void __lru_cache_activate_page(struct page *page) +static void __lru_cache_activate_folio(struct folio *folio) { struct pagevec *pvec; int i; @@ -380,8 +388,8 @@ static void __lru_cache_activate_page(struct page *page) for (i = pagevec_count(pvec) - 1; i >= 0; i--) { struct page *pagevec_page = pvec->pages[i]; - if (pagevec_page == page) { - SetPageActive(page); + if (pagevec_page == &folio->page) { + folio_set_active(folio); break; } } @@ -399,61 +407,59 @@ static void __lru_cache_activate_page(struct page *page) * When a newly allocated page is not yet visible, so safe for non-atomic ops, * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ -void mark_page_accessed(struct page *page) +void folio_mark_accessed(struct folio *folio) { - page = compound_head(page); - - if (!PageReferenced(page)) { - SetPageReferenced(page); - } else if (PageUnevictable(page)) { + if (!folio_test_referenced(folio)) { + folio_set_referenced(folio); + } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * evictable page accessed has no effect. */ - } else if (!PageActive(page)) { + } else if (!folio_test_active(folio)) { /* * If the page is on the LRU, queue it for activation via * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. */ - if (PageLRU(page)) - activate_page(page); + if (folio_test_lru(folio)) + folio_activate(folio); else - __lru_cache_activate_page(page); - ClearPageReferenced(page); - workingset_activation(page); + __lru_cache_activate_folio(folio); + folio_clear_referenced(folio); + workingset_activation(folio); } - if (page_is_idle(page)) - clear_page_idle(page); + if (folio_test_idle(folio)) + folio_clear_idle(folio); } -EXPORT_SYMBOL(mark_page_accessed); +EXPORT_SYMBOL(folio_mark_accessed); /** - * lru_cache_add - add a page to a page list - * @page: the page to be added to the LRU. + * folio_add_lru - Add a folio to an LRU list. + * @folio: The folio to be added to the LRU. * - * Queue the page for addition to the LRU via pagevec. The decision on whether + * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of lru_cache_add() - * have the page added to the active list using mark_page_accessed(). + * pagevec is drained. This gives a chance for the caller of folio_add_lru() + * have the folio added to the active list using folio_mark_accessed(). 
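/*
 * Editor's illustration, not part of the patch: the usual life cycle of
 * a freshly created page-cache folio with the two helpers above - queue
 * it for the LRU once, then let later accesses promote it (function
 * names invented):
 */
static void example_publish_folio(struct folio *folio)
{
        /* Newly allocated and not yet on any LRU list. */
        folio_add_lru(folio);
}

static void example_touch_folio(struct folio *folio)
{
        /*
         * The first touch sets the referenced flag; a further touch moves
         * the folio towards the active list via folio_activate() or the
         * pagevec path shown above.
         */
        folio_mark_accessed(folio);
}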
*/ -void lru_cache_add(struct page *page) +void folio_add_lru(struct folio *folio) { struct pagevec *pvec; - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - get_page(page); + folio_get(folio); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (pagevec_add_and_need_flush(pvec, page)) + if (pagevec_add_and_need_flush(pvec, &folio->page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } -EXPORT_SYMBOL(lru_cache_add); +EXPORT_SYMBOL(folio_add_lru); /** * lru_cache_add_inactive_or_unevictable @@ -888,11 +894,12 @@ void release_pages(struct page **pages, int nr) int i; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; - unsigned long flags; + unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); /* * Make sure the IRQ-safe lock-holding time does not get @@ -904,7 +911,7 @@ void release_pages(struct page **pages, int nr) lruvec = NULL; } - page = compound_head(page); + page = &folio->page; if (is_huge_zero_page(page)) continue; @@ -943,7 +950,7 @@ void release_pages(struct page **pages, int nr) if (PageLRU(page)) { struct lruvec *prev_lruvec = lruvec; - lruvec = relock_page_lruvec_irqsave(page, lruvec, + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); if (prev_lruvec != lruvec) lock_batch = 0; @@ -985,17 +992,18 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) +static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) { - int was_unevictable = TestClearPageUnevictable(page); - int nr_pages = thp_nr_pages(page); + int was_unevictable = folio_test_clear_unevictable(folio); + long nr_pages = folio_nr_pages(folio); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* - * Page becomes evictable in two ways: + * A folio becomes evictable in two ways: * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. - * 2) Before acquiring LRU lock to put the page to correct LRU and then + * 2) Before acquiring LRU lock to put the folio on the correct LRU + * and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] * @@ -1004,35 +1012,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) * * #0: __pagevec_lru_add_fn #1: clear_page_mlock * - * SetPageLRU() TestClearPageMlocked() + * folio_set_lru() folio_test_clear_mlocked() * smp_mb() // explicit ordering // above provides strict * // ordering - * PageMlocked() PageLRU() + * folio_test_mlocked() folio_test_lru() * * - * if '#1' does not observe setting of PG_lru by '#0' and fails - * isolation, the explicit barrier will make sure that page_evictable - * check will put the page in correct LRU. Without smp_mb(), SetPageLRU - * can be reordered after PageMlocked check and can make '#1' to fail - * the isolation of the page whose Mlocked bit is cleared (#0 is also - * looking at the same page) and the evictable page will be stranded - * in an unevictable LRU. + * if '#1' does not observe setting of PG_lru by '#0' and + * fails isolation, the explicit barrier will make sure that + * folio_evictable check will put the folio on the correct + * LRU. 
Without smp_mb(), folio_set_lru() can be reordered + * after folio_test_mlocked() check and can make '#1' fail the + * isolation of the folio whose mlocked bit is cleared (#0 is + * also looking at the same folio) and the evictable folio will + * be stranded on an unevictable LRU. */ - SetPageLRU(page); + folio_set_lru(folio); smp_mb__after_atomic(); - if (page_evictable(page)) { + if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { - ClearPageActive(page); - SetPageUnevictable(page); + folio_clear_active(folio); + folio_set_unevictable(folio); if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } - add_page_to_lru_list(page, lruvec); - trace_mm_lru_insertion(page); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_insertion(folio); } /* @@ -1046,10 +1055,10 @@ void __pagevec_lru_add(struct pagevec *pvec) unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(pvec->pages[i]); - lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); - __pagevec_lru_add_fn(page, lruvec); + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + __pagevec_lru_add_fn(folio, lruvec); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); diff --git a/mm/swap_state.c b/mm/swap_state.c index bc7cee6b2ec5..8d4104242100 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page, shadow); + workingset_refault(page_folio(page), shadow); /* Caller will initiate read into locked page */ lru_cache_add(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 22d10f713848..41c9e92f1f00 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -18,7 +18,7 @@ #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> -#include <linux/blkdev.h> +#include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> @@ -3534,13 +3534,13 @@ struct swap_info_struct *page_swap_info(struct page *page) } /* - * out-of-line __page_file_ methods to avoid include hell. + * out-of-line methods to avoid include hell. 
*/ -struct address_space *__page_file_mapping(struct page *page) +struct address_space *swapcache_mapping(struct folio *folio) { - return page_swap_info(page)->swap_file->f_mapping; + return page_swap_info(&folio->page)->swap_file->f_mapping; } -EXPORT_SYMBOL_GPL(__page_file_mapping); +EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __page_file_index(struct page *page) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7a9008415534..36e5f6ab976f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -164,7 +164,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL)) goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, diff --git a/mm/util.c b/mm/util.c index bacabe446906..e58151a61255 100644 --- a/mm/util.c +++ b/mm/util.c @@ -654,81 +654,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) } EXPORT_SYMBOL(kvrealloc); -static inline void *__page_rmapping(struct page *page) -{ - unsigned long mapping; - - mapping = (unsigned long)page->mapping; - mapping &= ~PAGE_MAPPING_FLAGS; - - return (void *)mapping; -} - /* Neutral page->mapping pointer to address_space or anon_vma or other */ void *page_rmapping(struct page *page) { - page = compound_head(page); - return __page_rmapping(page); + return folio_raw_mapping(page_folio(page)); } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. */ -bool page_mapped(struct page *page) +bool folio_mapped(struct folio *folio) { - int i; + long i, nr; - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) + if (folio_test_single(folio)) + return atomic_read(&folio->_mapcount) >= 0; + if (atomic_read(folio_mapcount_ptr(folio)) >= 0) return true; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) return false; - for (i = 0; i < compound_nr(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) + + nr = folio_nr_pages(folio); + for (i = 0; i < nr; i++) { + if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0) return true; } return false; } -EXPORT_SYMBOL(page_mapped); +EXPORT_SYMBOL(folio_mapped); struct anon_vma *page_anon_vma(struct page *page) { - unsigned long mapping; + struct folio *folio = page_folio(page); + unsigned long mapping = (unsigned long)folio->mapping; - page = compound_head(page); - mapping = (unsigned long)page->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; - return __page_rmapping(page); + return (void *)(mapping - PAGE_MAPPING_ANON); } -struct address_space *page_mapping(struct page *page) +/** + * folio_mapping - Find the mapping where this folio is stored. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the swap mapping + * this page is stored in (which is different from the mapping for the + * swap file or swap device where the data is stored). + * + * You can call this for folios which aren't in the swap cache or page + * cache and it will return NULL. 
+ */ +struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; - page = compound_head(page); - /* This happens if someone calls flush_dcache_page on slab page */ - if (unlikely(PageSlab(page))) + if (unlikely(folio_test_slab(folio))) return NULL; - if (unlikely(PageSwapCache(page))) { - swp_entry_t entry; + if (unlikely(folio_test_swapcache(folio))) + return swap_address_space(folio_swap_entry(folio)); - entry.val = page_private(page); - return swap_address_space(entry); - } - - mapping = page->mapping; + mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_ANON) return NULL; return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); } -EXPORT_SYMBOL(page_mapping); +EXPORT_SYMBOL(folio_mapping); /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) @@ -750,13 +747,26 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); -void copy_huge_page(struct page *dst, struct page *src) +/** + * folio_copy - Copy the contents of one folio to another. + * @dst: Folio to copy to. + * @src: Folio to copy from. + * + * The bytes in the folio represented by @src are copied to @dst. + * Assumes the caller has validated that @dst is at least as large as @src. + * Can be called in atomic context for order-0 folios, but if the folio is + * larger, it may sleep. + */ +void folio_copy(struct folio *dst, struct folio *src) { - unsigned i, nr = compound_nr(src); + long i = 0; + long nr = folio_nr_pages(src); - for (i = 0; i < nr; i++) { + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; cond_resched(); - copy_highpage(nth_page(dst, i), nth_page(src, i)); } } @@ -1079,3 +1089,14 @@ void page_offline_end(void) up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio) +{ + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + flush_dcache_page(folio_page(folio, i)); +} +EXPORT_SYMBOL(flush_dcache_folio); +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d77830ff604c..e8a807c78110 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2816,6 +2816,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; + struct page *page; + int i; /* * For order-0 pages we make use of bulk allocator, if @@ -2823,7 +2825,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * to fails, fallback to a single page allocator that is * more permissive. */ - if (!order) { + if (!order && nid != NUMA_NO_NODE) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2848,7 +2850,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else + } else if (order) /* * Compound pages required for remap_vmalloc_page if * high-order pages. @@ -2856,11 +2858,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, gfp |= __GFP_COMP; /* High-order pages or fallback path if "bulk" fails. 
*/ - while (nr_allocated < nr_pages) { - struct page *page; - int i; - page = alloc_pages_node(nid, gfp, order); + while (nr_allocated < nr_pages) { + if (nid == NUMA_NO_NODE) + page = alloc_pages(gfp, order); + else + page = alloc_pages_node(nid, gfp, order); if (unlikely(!page)) break; diff --git a/mm/vmscan.c b/mm/vmscan.c index 74296c2d1fed..306229c4313f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2090,6 +2090,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ int isolate_lru_page(struct page *page) { + struct folio *folio = page_folio(page); int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); @@ -2099,7 +2100,7 @@ int isolate_lru_page(struct page *page) struct lruvec *lruvec; get_page(page); - lruvec = lock_page_lruvec_irq(page); + lruvec = folio_lruvec_lock_irq(folio); del_page_from_lru_list(page, lruvec); unlock_page_lruvec_irq(lruvec); ret = 0; @@ -2199,7 +2200,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). */ - VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page); + VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page); add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); nr_moved += nr_pages; @@ -4665,6 +4666,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; + struct folio *folio = page_folio(page); int nr_pages; if (PageTransTail(page)) @@ -4677,7 +4679,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - lruvec = relock_page_lruvec_irq(page, lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); if (page_evictable(page) && PageUnevictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); diff --git a/mm/workingset.c b/mm/workingset.c index d5b81e4f4cbe..109ab978251a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -273,17 +273,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) } /** - * workingset_refault - evaluate the refault of a previously evicted page - * @page: the freshly allocated replacement page - * @shadow: shadow entry of the evicted page + * workingset_refault - Evaluate the refault of a previously evicted folio. + * @folio: The freshly allocated replacement folio. + * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the node and the memcg whose memory + * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ -void workingset_refault(struct page *page, void *shadow) +void workingset_refault(struct folio *folio, void *shadow) { - bool file = page_is_file_lru(page); + bool file = folio_is_file_lru(folio); struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; @@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow) unsigned long refault; bool workingset; int memcgid; + long nr; unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* * Look up the memcg associated with the stored ID. It might - * have been deleted since the page's eviction. + * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled - * for a new cgroup that refaults a shared page. This is + * for a new cgroup that refaults a shared folio. 
This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging @@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; /* - * The activation decision for this page is made at the level + * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order - * during page reclaim is being determined. + * during folio reclaim is being determined. * - * However, the cgroup that will own the page is the one that + * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. */ - memcg = page_memcg(page); + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); - inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); mem_cgroup_flush_stats(); /* @@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow) if (refault_distance > workingset_size) goto out; - SetPageActive(page); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); + folio_set_active(folio); + workingset_age_nonresident(lruvec, nr); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); - /* Page was active prior to eviction */ + /* Folio was active prior to eviction */ if (workingset) { - SetPageWorkingset(page); + folio_set_workingset(folio); /* XXX: Move to lru_cache_add() when it supports new vs putback */ - lru_note_cost_page(page); - inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); + lru_note_cost_folio(folio); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: rcu_read_unlock(); @@ -393,12 +395,11 @@ out: /** * workingset_activation - note a page activation - * @page: page that is being activated + * @folio: Folio that is being activated. */ -void workingset_activation(struct page *page) +void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); /* @@ -408,11 +409,10 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. 
*/ - memcg = page_memcg_rcu(page); + memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_page_lruvec(page); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); + workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); out: rcu_read_unlock(); } diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 1669744304c5..17687848daec 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1560,10 +1560,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); - bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.claim_hash) + return -ENOMEM; - if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash) + bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.backbone_hash) { + batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; + } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3ddd66e4c29e..5207cd8d6ad8 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -190,29 +190,41 @@ int batadv_mesh_init(struct net_device *soft_iface) bat_priv->gw.generation = 0; - ret = batadv_v_mesh_init(bat_priv); - if (ret < 0) - goto err; - ret = batadv_originator_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_orig; + } ret = batadv_tt_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_tt; + } + + ret = batadv_v_mesh_init(bat_priv); + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_v; + } ret = batadv_bla_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_bla; + } ret = batadv_dat_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_dat; + } ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_nc; + } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -222,8 +234,20 @@ int batadv_mesh_init(struct net_device *soft_iface) return 0; -err: - batadv_mesh_free(soft_iface); +err_nc: + batadv_dat_free(bat_priv); +err_dat: + batadv_bla_free(bat_priv); +err_bla: + batadv_v_mesh_free(bat_priv); +err_v: + batadv_tt_free(bat_priv); +err_tt: + batadv_originator_free(bat_priv); +err_orig: + batadv_purge_outstanding_packets(bat_priv, NULL); + atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); + return ret; } diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 9f06132e007d..0a7f1d36a6a8 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -152,8 +152,10 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) &batadv_nc_coding_hash_lock_class_key); bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) + if (!bat_priv->nc.decoding_hash) { + batadv_hash_destroy(bat_priv->nc.coding_hash); goto err; + } batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, &batadv_nc_decoding_hash_lock_class_key); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c 
index e0b3dace2020..4b7ad6684bc4 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -4162,8 +4162,10 @@ int batadv_tt_init(struct batadv_priv *bat_priv) return ret; ret = batadv_tt_global_init(bat_priv); - if (ret < 0) + if (ret < 0) { + batadv_tt_local_table_free(bat_priv); return ret; + } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6c58fc14d2cb..5c6c4305ed23 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1666,7 +1666,8 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + - nla_total_size(sizeof(struct br_mcast_stats)) + + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + + (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) + nla_total_size(0); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e8136db44462..37ca76406f1e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1125,9 +1125,7 @@ static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brm static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) { - /* use the RFC default of 2 for QRV */ - return 2 * brmctx->multicast_query_interval + - brmctx->multicast_query_response_interval; + return brmctx->multicast_membership_interval; } static inline bool diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83d1798dfbb4..ba045f35114d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -926,7 +926,9 @@ static int translate_table(struct net *net, const char *name, return -ENOMEM; for_each_possible_cpu(i) { newinfo->chainstack[i] = - vmalloc(array_size(udc_cnt, sizeof(*(newinfo->chainstack[0])))); + vmalloc_node(array_size(udc_cnt, + sizeof(*(newinfo->chainstack[0]))), + cpu_to_node(i)); if (!newinfo->chainstack[i]) { while (i) vfree(newinfo->chainstack[--i]); diff --git a/net/can/isotp.c b/net/can/isotp.c index caaa532ece94..df6968b28bf4 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -121,7 +121,7 @@ enum { struct tpcon { int idx; int len; - u8 state; + u32 state; u8 bs; u8 sn; u8 ll_dl; @@ -848,6 +848,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); + u32 old_state = so->tx.state; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; @@ -860,45 +861,55 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return -EADDRNOTAVAIL; /* we do not support multiple buffers - for now */ - if (so->tx.state != ISOTP_IDLE || wq_has_sleeper(&so->wait)) { - if (msg->msg_flags & MSG_DONTWAIT) - return -EAGAIN; + if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE || + wq_has_sleeper(&so->wait)) { + if (msg->msg_flags & MSG_DONTWAIT) { + err = -EAGAIN; + goto err_out; + } /* wait for complete transmission of current pdu */ - wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + if (err) + goto err_out; } - if (!size || size > MAX_MSG_LENGTH) - return -EINVAL; + if (!size || size > MAX_MSG_LENGTH) { + err = -EINVAL; + goto err_out; + } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 
1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && - (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) - return -EINVAL; + (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { + err = -EINVAL; + goto err_out; + } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) - return err; + goto err_out; dev = dev_get_by_index(sock_net(sk), so->ifindex); - if (!dev) - return -ENXIO; + if (!dev) { + err = -ENXIO; + goto err_out; + } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); - return err; + goto err_out; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; - so->tx.state = ISOTP_SENDING; so->tx.len = size; so->tx.idx = 0; @@ -954,15 +965,25 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); - return err; + goto err_out; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + + if (sk->sk_err) + return -sk->sk_err; } return size; + +err_out: + so->tx.state = old_state; + if (so->tx.state == ISOTP_IDLE) + wake_up_interruptible(&so->wait); + + return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index f6df20808f5e..16af1a7f80f6 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -330,6 +330,7 @@ int j1939_session_activate(struct j1939_session *session); void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec); void j1939_session_timers_cancel(struct j1939_session *session); +#define J1939_MIN_TP_PACKET_SIZE 9 #define J1939_MAX_TP_PACKET_SIZE (7 * 0xff) #define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 08c8606cfd9c..9bc55ecb37f9 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -249,11 +249,14 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) struct j1939_priv *priv, *priv_new; int ret; - priv = j1939_priv_get_by_ndev(ndev); + spin_lock(&j1939_netdev_lock); + priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); + spin_unlock(&j1939_netdev_lock); return priv; } + spin_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) @@ -269,10 +272,10 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) /* Someone was faster than us, use their priv and roll * back our's. 
*/ + kref_get(&priv_new->rx_kref); spin_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); - kref_get(&priv_new->rx_kref); return priv_new; } j1939_priv_set(ndev, priv); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index bb5c4b8979be..6c0a0ebdd024 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1237,12 +1237,11 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) session->err = -ETIME; j1939_session_deactivate(session); } else { - netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", - __func__, session); - j1939_session_list_lock(session->priv); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_ACTIVE_MAX) { + netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", + __func__, session); j1939_session_get(session); hrtimer_start(&session->rxtimer, ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), @@ -1609,6 +1608,8 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, abort = J1939_XTP_ABORT_FAULT; else if (len > priv->tp_max_packet_size) abort = J1939_XTP_ABORT_RESOURCE; + else if (len < J1939_MIN_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; } if (abort != J1939_XTP_NO_ABORT) { @@ -1789,6 +1790,7 @@ static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { + enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT; struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *skcb, *se_skcb; struct sk_buff *se_skb = NULL; @@ -1803,9 +1805,11 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, skcb = j1939_skb_to_cb(skb); dat = skb->data; - if (skb->len <= 1) + if (skb->len != 8) { /* makes no sense */ + abort = J1939_XTP_ABORT_UNEXPECTED_DATA; goto out_session_cancel; + } switch (session->last_cmd) { case 0xff: @@ -1904,7 +1908,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, out_session_cancel: kfree_skb(se_skb); j1939_session_timers_cancel(session); - j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); + j1939_session_cancel(session, abort); j1939_session_put(session); } diff --git a/net/core/dev.c b/net/core/dev.c index 7ee9fecd3aff..eb3a366bf212 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3163,6 +3163,12 @@ static u16 skb_tx_hash(const struct net_device *dev, qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; + if (unlikely(!qcount)) { + net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", + sb_dev->name, qoffset, tc); + qoffset = 0; + qcount = dev->real_num_tx_queues; + } } if (skb_rx_queue_recorded(skb)) { @@ -3906,7 +3912,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb)); skb_dst_force(skb); netif_rx_ni(skb); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index eab5fc88a002..d8b9dbabd4a4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -77,8 +77,8 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - seq_printf(seq, "%9s: %16llu %12llu %4llu %6llu %4llu %5llu %10llu %9llu " - "%16llu %12llu %4llu %6llu %4llu %5llu %7llu %10llu\n", + 
seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, @@ -103,11 +103,11 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) static int dev_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "Interface| Receive " - " | Transmit\n" - " | bytes packets errs drop fifo frame " - "compressed multicast| bytes packets errs " - " drop fifo colls carrier compressed\n"); + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); else dev_seq_printf_stats(seq, v); return 0; @@ -259,14 +259,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v) struct packet_type *pt = v; if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); + seq_puts(seq, "Type Device Function\n"); else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-9s %ps\n", + seq_printf(seq, " %-8s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } @@ -327,14 +327,12 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) struct netdev_hw_addr *ha; struct net_device *dev = v; - if (v == SEQ_START_TOKEN) { - seq_puts(seq, "Ifindex Interface Refcount Global_use Address\n"); + if (v == SEQ_START_TOKEN) return 0; - } netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - seq_printf(seq, "%-7d %-9s %-8d %-10d %*phN\n", + seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", dev->ifindex, dev->name, ha->refcount, ha->global_use, (int)dev->addr_len, ha->addr); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f6197774048b..b2e49eb7001d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1973,9 +1973,9 @@ int netdev_register_kobject(struct net_device *ndev) int netdev_change_owner(struct net_device *ndev, const struct net *net_old, const struct net *net_new) { + kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; + kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; struct device *dev = &ndev->dev; - kuid_t old_uid, new_uid; - kgid_t old_gid, new_gid; int error; net_ns_get_ownership(net_old, &old_uid, &old_gid); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 972c8cb303a5..8ccce85562a1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5262,7 +5262,7 @@ nla_put_failure: static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { - size_t size = 0; + size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2170bea2c7de..fe9358437380 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -80,6 +80,7 @@ #include <linux/indirect_call_wrapper.h> #include "datagram.h" +#include "sock_destructor.h" struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; @@ -1804,30 +1805,39 @@ EXPORT_SYMBOL(skb_realloc_headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) { int delta = headroom - skb_headroom(skb); + int osize = skb_end_offset(skb); + struct sock *sk = 
skb->sk; if (WARN_ONCE(delta <= 0, "%s is expecting an increase in the headroom", __func__)) return skb; - /* pskb_expand_head() might crash, if skb is shared */ - if (skb_shared(skb)) { + delta = SKB_DATA_ALIGN(delta); + /* pskb_expand_head() might crash, if skb is shared. */ + if (skb_shared(skb) || !is_skb_wmem(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); - if (likely(nskb)) { - if (skb->sk) - skb_set_owner_w(nskb, skb->sk); - consume_skb(skb); - } else { - kfree_skb(skb); - } + if (unlikely(!nskb)) + goto fail; + + if (sk) + skb_set_owner_w(nskb, sk); + consume_skb(skb); skb = nskb; } - if (skb && - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { - kfree_skb(skb); - skb = NULL; + if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) + goto fail; + + if (sk && is_skb_wmem(skb)) { + delta = skb_end_offset(skb) - osize; + refcount_add(delta, &sk->sk_wmem_alloc); + skb->truesize += delta; } return skb; + +fail: + kfree_skb(skb); + return NULL; } EXPORT_SYMBOL(skb_expand_head); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2d6249b28928..a86ef7e844f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); +bool sk_msg_is_readable(struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} +EXPORT_SYMBOL_GPL(sk_msg_is_readable); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h new file mode 100644 index 000000000000..2f396e6bfba5 --- /dev/null +++ b/net/core/sock_destructor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_SOCK_DESTRUCTOR_H +#define _NET_CORE_SOCK_DESTRUCTOR_H +#include <net/tcp.h> + +static inline bool is_skb_wmem(const struct sk_buff *skb) +{ + return skb->destructor == sock_wfree || + skb->destructor == __sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree); +} +#endif diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c8496c1142c9..5f88526ad61c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -419,7 +419,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, - .extra2 = &long_max, + .extra2 = &bpf_jit_limit_max, }, #endif { diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 548285539752..d8ee15f1c7a9 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -101,8 +101,6 @@ config NET_DSA_TAG_RTL4_A config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches, using NPI port" - depends on MSCC_OCELOT_SWITCH_LIB || \ - (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) select PACKING help Say Y or M if you want to enable NPI tagging for the Ocelot switches @@ -114,8 +112,6 @@ config NET_DSA_TAG_OCELOT config NET_DSA_TAG_OCELOT_8021Q tristate "Tag driver for Ocelot family of switches, using VLAN" - depends on MSCC_OCELOT_SWITCH_LIB || \ - (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) help Say Y or M if you want to enable support for tagging frames with a custom VLAN-based header. 
Frames that require timestamping, such as @@ -138,7 +134,6 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" - depends on NET_DSA_SJA1105 || !NET_DSA_SJA1105 select PACKING help Say Y or M if you want to enable support for tagging frames with the diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b29262eee00b..e9911b18bdbf 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -170,7 +170,7 @@ void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num) /* Check if the bridge is still in use, otherwise it is time * to clean it up so we can reuse this bridge_num later. */ - if (!dsa_bridge_num_find(bridge_dev)) + if (dsa_bridge_num_find(bridge_dev) < 0) clear_bit(bridge_num, &dsa_fwd_offloading_bridges); } @@ -811,7 +811,9 @@ static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) if (!dsa_is_cpu_port(ds, port)) continue; + rtnl_lock(); err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + rtnl_unlock(); if (err) { dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", tag_ops->name, ERR_PTR(err)); @@ -1372,12 +1374,15 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, for_each_available_child_of_node(ports, port) { err = of_property_read_u32(port, "reg", ®); - if (err) + if (err) { + of_node_put(port); goto out_put_node; + } if (reg >= ds->num_ports) { dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%zu)\n", port, reg, ds->num_ports); + of_node_put(port); err = -EINVAL; goto out_put_node; } @@ -1385,8 +1390,10 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); - if (err) + if (err) { + of_node_put(port); goto out_put_node; + } } out_put_node: diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 1c797ec8e2c2..6466d0539af9 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -168,7 +168,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (extack._msg) dev_err(ds->dev, "port %d: %s\n", info->port, extack._msg); - if (err && err != EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) return err; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 77d0ce89ab77..b3da4b2ea11c 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -45,6 +45,7 @@ * 6 6 2 2 4 2 N */ +#include <linux/dsa/mv88e6xxx.h> #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> @@ -129,12 +130,9 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 tag_dev, tag_port; enum dsa_cmd cmd; u8 *dsa_header; - u16 pvid = 0; - int err; if (skb->offload_fwd_mark) { struct dsa_switch_tree *dst = dp->ds->dst; - struct net_device *br = dp->bridge_dev; cmd = DSA_CMD_FORWARD; @@ -144,19 +142,6 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, */ tag_dev = dst->last_switch + 1 + dp->bridge_num; tag_port = 0; - - /* If we are offloading forwarding for a VLAN-unaware bridge, - * inject packets to hardware using the bridge's pvid, since - * that's where the packets ingressed from. - */ - if (!br_vlan_enabled(br)) { - /* Safe because __dev_queue_xmit() runs under - * rcu_read_lock_bh() - */ - err = br_vlan_get_pvid_rcu(br, &pvid); - if (err) - return NULL; - } } else { cmd = DSA_CMD_FROM_CPU; tag_dev = dp->ds->index; @@ -180,16 +165,21 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, dsa_header[2] &= ~0x10; } } else { + struct net_device *br = dp->bridge_dev; + u16 vid; + + vid = br ? 
MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE; + skb_push(skb, DSA_HLEN + extra); dsa_alloc_etype_header(skb, DSA_HLEN + extra); - /* Construct untagged DSA tag. */ + /* Construct DSA header from untagged frame. */ dsa_header = dsa_etype_header_pos_tx(skb) + extra; dsa_header[0] = (cmd << 6) | tag_dev; dsa_header[1] = tag_port << 3; - dsa_header[2] = pvid >> 8; - dsa_header[3] = pvid & 0xff; + dsa_header[2] = vid >> 8; + dsa_header[3] = vid & 0xff; } return skb; @@ -210,7 +200,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, cmd = dsa_header[0] >> 6; switch (cmd) { case DSA_CMD_FORWARD: - trunk = !!(dsa_header[1] & 7); + trunk = !!(dsa_header[1] & 4); break; case DSA_CMD_TO_CPU: diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 8025ed778d33..605b51ca6921 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -2,7 +2,6 @@ /* Copyright 2019 NXP */ #include <linux/dsa/ocelot.h> -#include <soc/mscc/ocelot.h> #include "dsa_priv.h" static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 59072930cb02..3412051981d7 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -9,10 +9,32 @@ * that on egress */ #include <linux/dsa/8021q.h> -#include <soc/mscc/ocelot.h> -#include <soc/mscc/ocelot_ptp.h> +#include <linux/dsa/ocelot.h> #include "dsa_priv.h" +static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp, + struct sk_buff *skb) +{ + struct felix_deferred_xmit_work *xmit_work; + struct felix_port *felix_port = dp->priv; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + /* Calls felix_port_deferred_xmit in felix.c */ + kthread_init_work(&xmit_work->work, felix_port->xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. 
+ */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(felix_port->xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -20,18 +42,10 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct ocelot *ocelot = dp->ds->priv; - int port = dp->index; - u32 rew_op = 0; + struct ethhdr *hdr = eth_hdr(skb); - rew_op = ocelot_ptp_rew_op(skb); - if (rew_op) { - if (!ocelot_can_inject(ocelot, 0)) - return NULL; - - ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); - return NULL; - } + if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest)) + return ocelot_defer_xmit(dp, skb); return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index c054f48541c8..2edede9ddac9 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -4,6 +4,7 @@ #include <linux/if_vlan.h> #include <linux/dsa/sja1105.h> #include <linux/dsa/8021q.h> +#include <linux/skbuff.h> #include <linux/packing.h> #include "dsa_priv.h" @@ -53,6 +54,11 @@ #define SJA1110_TX_TRAILER_LEN 4 #define SJA1110_MAX_PADDING_LEN 15 +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -520,6 +526,43 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } +static void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, + u8 ts_id, enum sja1110_meta_tstamp dir, + u64 tstamp) +{ + struct sk_buff *skb, *skb_tmp, *skb_match = NULL; + struct dsa_port *dp = dsa_to_port(ds, port); + struct skb_shared_hwtstamps shwt = {0}; + struct sja1105_port *sp = dp->priv; + + if (!dsa_port_is_sja1105(dp)) + return; + + /* We don't care about RX timestamps on the CPU port */ + if (dir == SJA1110_META_TSTAMP_RX) + return; + + spin_lock(&sp->data->skb_txtstamp_queue.lock); + + skb_queue_walk_safe(&sp->data->skb_txtstamp_queue, skb, skb_tmp) { + if (SJA1105_SKB_CB(skb)->ts_id != ts_id) + continue; + + __skb_unlink(skb, &sp->data->skb_txtstamp_queue); + skb_match = skb; + + break; + } + + spin_unlock(&sp->data->skb_txtstamp_queue.lock); + + if (WARN_ON(!skb_match)) + return; + + shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); + skb_complete_tx_timestamp(skb_match, &shwt); +} + static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) { u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8b30cadff708..b7e277d8a84d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1054,14 +1054,19 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); if (!ext_hdr || !iio) goto send_mal_query; - if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr)) + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) || + ntohs(iio->extobj_hdr.length) > sizeof(_iio)) goto send_mal_query; ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + iio = skb_header_pointer(skb, sizeof(_ext_hdr), + sizeof(iio->extobj_hdr) + ident_len, &_iio); + if (!iio) + goto send_mal_query; + status = 0; dev = NULL; switch 
(iio->extobj_hdr.class_type) { case ICMP_EXT_ECHO_CTYPE_NAME: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); if (ident_len >= IFNAMSIZ) goto send_mal_query; memset(buff, 0, sizeof(buff)); @@ -1069,30 +1074,24 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) dev = dev_get_by_name(net, buff); break; case ICMP_EXT_ECHO_CTYPE_INDEX: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(iio->ident.ifindex), &_iio); if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; case ICMP_EXT_ECHO_CTYPE_ADDR: - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) || + ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { case ICMP_AFI_IP: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(struct in_addr), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr)) goto send_mal_query; dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in6_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); dev_hold(dev); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 80aeaf9e6e16..bfb522e51346 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -242,8 +242,10 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; + score = sk->sk_bound_dev_if ? 2 : 1; - score = sk->sk_family == PF_INET ? 
2 : 1; + if (sk->sk_family == PF_INET) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 613432a36f0a..e61ea428ea18 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -20,13 +20,8 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int defrag4_pernet_id __read_mostly; static DEFINE_MUTEX(defrag4_mutex); -struct defrag4_pernet { - unsigned int users; -}; - static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -111,19 +106,15 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = { static void __net_exit defrag4_net_exit(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - - if (nf_defrag->users) { + if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); - nf_defrag->users = 0; + net->nf.defrag_ipv4_users = 0; } } static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, - .id = &defrag4_pernet_id, - .size = sizeof(struct defrag4_pernet), }; static int __init nf_defrag_init(void) @@ -138,24 +129,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv4_enable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; mutex_lock(&defrag4_mutex); - if (nf_defrag->users == UINT_MAX) { + if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_defrag->users) { - nf_defrag->users++; + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) - nf_defrag->users = 1; + net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); @@ -165,12 +155,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - mutex_lock(&defrag4_mutex); - if (nf_defrag->users) { - nf_defrag->users--; - if (nf_defrag->users == 0) + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users--; + if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e8b48df73c85..f5c336f8b0c8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -486,10 +486,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; - - if (sk->sk_prot->stream_memory_read) - return sk->sk_prot->stream_memory_read(sk); - return false; + return sk_is_readable(sk); } /* diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index d3e9386b493e..5f4d6f45d87f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL -static bool tcp_bpf_stream_read(const struct sock *sk) -{ - struct sk_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) - empty = list_empty(&psock->ingress_msg); - rcu_read_unlock(); - return !empty; -} - static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { @@ -232,6 +219,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = 
sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; + u32 eval = __SK_NONE; int ret; more_data: @@ -275,13 +263,24 @@ more_data: case __SK_REDIRECT: sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); + if (!psock->apply_bytes) { + /* Clean up before releasing the sock lock. */ + eval = psock->eval; + psock->eval = __SK_NONE; + psock->sk_redir = NULL; + } if (psock->cork) { cork = true; psock->cork = NULL; } sk_msg_return(sk, msg, tosend); release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + + if (eval == __SK_REDIRECT) + sock_put(sk_redir); + lock_sock(sk); if (unlikely(ret < 0)) { int free = sk_msg_free_nocharge(sk, msg); @@ -479,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; - prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2e62e0d6373a..5b8ce65dfc06 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1037,6 +1037,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); +static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) +{ + if (!old) + return true; + + /* l3index always overrides non-l3index */ + if (old->l3index && new->l3index == 0) + return false; + if (old->l3index == 0 && new->l3index) + return true; + + return old->prefixlen < new->prefixlen; +} + /* Find the Key structure for an address. 
*/ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, @@ -1059,7 +1073,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, lockdep_sock_is_held(sk)) { if (key->family != family) continue; - if (key->l3index && key->l3index != l3index) + if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); @@ -1074,8 +1088,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, match = false; } - if (match && (!best_match || - key->prefixlen > best_match->prefixlen)) + if (match && better_md5_match(best_match, key)) best_match = key; } return best_match; @@ -1085,7 +1098,7 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, - int l3index) + int l3index, u8 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1105,7 +1118,9 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, lockdep_sock_is_held(sk)) { if (key->family != family) continue; - if (key->l3index && key->l3index != l3index) + if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) + continue; + if (key->l3index != l3index) continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) @@ -1129,7 +1144,7 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, + int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ @@ -1137,7 +1152,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (key) { /* Pre-existing entry - just update that one. * Note that the key might be used concurrently. @@ -1182,6 +1197,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->family = family; key->prefixlen = prefixlen; key->l3index = l3index; + key->flags = flags; memcpy(&key->addr, addr, (family == AF_INET6) ? 
sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1191,11 +1207,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen, int l3index) + u8 prefixlen, int l3index, u8 flags) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1229,6 +1245,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, const union tcp_md5_addr *addr; u8 prefixlen = 32; int l3index = 0; + u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1239,6 +1256,8 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, if (sin->sin_family != AF_INET) return -EINVAL; + flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; @@ -1246,7 +1265,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } - if (optname == TCP_MD5SIG_EXT && + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; @@ -1267,12 +1286,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1596,7 +1615,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. 
*/ - tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2a7825a5b842..2fffcf2b54f3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -390,7 +390,8 @@ static int compute_score(struct sock *sk, struct net *net, dif, sdif); if (!dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if) + score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -2866,6 +2867,9 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); + /* psock ingress_msg queue should not contain any bad checksum frames */ + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; return mask; } diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a1d5f473878..bbe6569c9ad3 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -114,6 +114,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) *prot = *base; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 55c290d55605..67c9114835c8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -106,7 +106,7 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score = 1; + score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 5e8961004832..d128172bb549 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -770,6 +770,66 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, data += sizeof(__be32); } + /* bit12 undefined: filled with empty value */ + if (trace->type.bit12) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit13 undefined: filled with empty value */ + if (trace->type.bit13) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit14 undefined: filled with empty value */ + if (trace->type.bit14) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit15 undefined: filled with empty value */ + if (trace->type.bit15) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit16 undefined: filled with empty value */ + if (trace->type.bit16) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit17 undefined: filled with empty value */ + if (trace->type.bit17) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit18 undefined: filled with empty value */ + if (trace->type.bit18) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit19 undefined: filled with empty value */ + if (trace->type.bit19) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit20 undefined: filled with empty value */ + if (trace->type.bit20) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit21 undefined: 
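
Trace-type bits 12 to 21 are undefined for the IOAM pre-allocated trace; instead of refusing to build the trace data (the old skip is removed further down), each set-but-undefined bit in the hunks here and just below now contributes one 4-byte "empty" slot, IOAM6_U32_UNAVAILABLE. A compact model of the same fill, treating the trace type as a plain 24-bit mask rather than the kernel's bitfield struct; the names and the bit-ordering convention are only a modelling convenience:

#include <stdint.h>
#include <string.h>

#define U32_UNAVAILABLE 0xffffffffU	/* stands in for IOAM6_U32_UNAVAILABLE */

/* Append one 4-byte "empty" slot per undefined-but-set trace bit (12..21).
 * 'type' is modelled as a 24-bit mask with bit 0 as the most significant
 * bit, purely as a convenience instead of the kernel's bitfields.
 */
static uint8_t *fill_undefined_bits(uint32_t type, uint8_t *data)
{
	uint32_t empty = U32_UNAVAILABLE;	/* byte order ignored in this model */

	for (int bit = 12; bit <= 21; bit++) {
		if (type & (1U << (23 - bit))) {
			memcpy(data, &empty, sizeof(empty));
			data += sizeof(empty);
		}
	}
	return data;
}

int main(void)
{
	uint8_t buf[40];
	uint32_t type = 0x3ffU << 2;	/* bits 12..21 all set in this model */

	/* Ten undefined bits set means forty bytes of trace data consumed. */
	return (fill_undefined_bits(type, buf) - buf) == 40 ? 0 : 1;
}
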
filled with empty value */ + if (trace->type.bit21) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + /* opaque state snapshot */ if (trace->type.bit22) { if (!sc) { @@ -791,16 +851,10 @@ void ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc; u8 sclen = 0; - /* Skip if Overflow flag is set OR - * if an unknown type (bit 12-21) is set + /* Skip if Overflow flag is set */ - if (trace->overflow || - trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | - trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | - trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | - trace->type.bit21) { + if (trace->overflow) return; - } /* NodeLen does not include Opaque State Snapshot length. We need to * take it into account if the corresponding bit is set (bit 22) and diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index f9ee04541c17..9b7b726f8f45 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -75,7 +75,11 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) u32 fields; if (!trace->type_be32 || !trace->remlen || - trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4) + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) return false; trace->nodelen = 0; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 12f985f43bcc..2f044a49afa8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -464,13 +464,14 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { - struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + struct inet6_dev *idev; u32 mtu; + idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); if (net->ipv6.devconf_all->forwarding == 0) goto error; diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 733c83d38b30..4ad8b2032f1f 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -25,12 +25,7 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { - bool r; - pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", - invert ? '!' : ' ', min, id, max); - r = (id >= min && id <= max) ^ invert; - pr_debug(" result %s\n", r ? 
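
With the pr_debug() calls stripped out, segsleft_match() in the ip6t_rt hunk that continues below reduces to a plain inclusive range test whose result the rule can invert. The same idiom in isolation, with hypothetical names:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Inclusive range match with optional inversion, as in segsleft_match(). */
static bool range_match(uint32_t min, uint32_t max, uint32_t id, bool invert)
{
	return (id >= min && id <= max) ^ invert;
}

int main(void)
{
	assert(range_match(0, 0, 0, false));	/* segments_left == 0 matches */
	assert(!range_match(0, 0, 1, false));
	assert(range_match(0, 0, 1, true));	/* inverted: anything but 0 */
	return 0;
}
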
"PASS" : "FAILED"); - return r; + return (id >= min && id <= max) ^ invert; } static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) @@ -65,30 +60,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) return false; } - pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); - pr_debug("TYPE %04X ", rh->type); - pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); - - pr_debug("IPv6 RT segsleft %02X ", - segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], - rh->segments_left, - !!(rtinfo->invflags & IP6T_RT_INV_SGS))); - pr_debug("type %02X %02X %02X ", - rtinfo->rt_type, rh->type, - (!(rtinfo->flags & IP6T_RT_TYP) || - ((rtinfo->rt_type == rh->type) ^ - !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); - pr_debug("len %02X %04X %02X ", - rtinfo->hdrlen, hdrlen, - !(rtinfo->flags & IP6T_RT_LEN) || - ((rtinfo->hdrlen == hdrlen) ^ - !!(rtinfo->invflags & IP6T_RT_INV_LEN))); - pr_debug("res %02X %02X %02X ", - rtinfo->flags & IP6T_RT_RES, - ((const struct rt0_hdr *)rh)->reserved, - !((rtinfo->flags & IP6T_RT_RES) && - (((const struct rt0_hdr *)rh)->reserved))); - ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && @@ -107,22 +78,22 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) reserved), sizeof(_reserved), &_reserved); + if (!rp) { + par->hotdrop = true; + return false; + } ret = (*rp == 0); } - pr_debug("#%d ", rtinfo->addrnr); if (!(rtinfo->flags & IP6T_RT_FST)) { return ret; } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { - pr_debug("Not strict "); if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { - pr_debug("There isn't enough space\n"); return false; } else { unsigned int i = 0; - pr_debug("#%d ", rtinfo->addrnr); for (temp = 0; temp < (unsigned int)((hdrlen - 8) / 16); temp++) { @@ -138,26 +109,20 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) return false; } - if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { - pr_debug("i=%d temp=%d;\n", i, temp); + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) i++; - } if (i == rtinfo->addrnr) break; } - pr_debug("i=%d #%d\n", i, rtinfo->addrnr); if (i == rtinfo->addrnr) return ret; else return false; } } else { - pr_debug("Strict "); if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { - pr_debug("There isn't enough space\n"); return false; } else { - pr_debug("#%d ", rtinfo->addrnr); for (temp = 0; temp < rtinfo->addrnr; temp++) { ap = skb_header_pointer(skb, ptr @@ -173,7 +138,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; } - pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr); if (temp == rtinfo->addrnr && temp == (unsigned int)((hdrlen - 8) / 16)) return ret; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a0108415275f..5c47be29b9ee 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,7 +33,7 @@ static const char nf_frags_cache_name[] = "nf-frags"; -unsigned int nf_frag_pernet_id __read_mostly; +static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e8a59d8bf2ad..cb4eb1d2c620 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ 
b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -25,8 +25,6 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> -extern unsigned int nf_frag_pernet_id; - static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -91,12 +89,10 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = { static void __net_exit defrag6_net_exit(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - - if (nf_frag->users) { + if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); - nf_frag->users = 0; + net->nf.defrag_ipv6_users = 0; } } @@ -134,24 +130,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv6_enable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; mutex_lock(&defrag6_mutex); - if (nf_frag->users == UINT_MAX) { + if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_frag->users) { - nf_frag->users++; + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) - nf_frag->users = 1; + net->nf.defrag_ipv6_users = 1; out_unlock: mutex_unlock(&defrag6_mutex); @@ -161,12 +156,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - mutex_lock(&defrag6_mutex); - if (nf_frag->users) { - nf_frag->users--; - if (nf_frag->users == 0) + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users--; + if (net->nf.defrag_ipv6_users == 0) nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0ce52d46e4f8..b03dd02c9f13 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -599,6 +599,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; int l3index = 0; u8 prefixlen; + u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; @@ -609,6 +610,8 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (sin6->sin6_family != AF_INET6) return -EINVAL; + flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; @@ -619,7 +622,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 
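
The defrag user count moves from the nf_frag pernet data into net->nf.defrag_ipv6_users, but the pattern stays the same: a counter serialized by defrag6_mutex that registers the hooks on the 0 to 1 transition and unregisters them on 1 to 0. A userspace sketch of that pattern, with the hook (un)registration stubbed out and a pthread mutex standing in for the kernel mutex:

#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t defrag_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int defrag_users;

static void register_hooks(void)   { puts("hooks registered");   }
static void unregister_hooks(void) { puts("hooks unregistered"); }

static int defrag_enable(void)
{
	int err = 0;

	pthread_mutex_lock(&defrag_mutex);
	if (defrag_users == UINT_MAX)
		err = -EOVERFLOW;
	else if (defrag_users++ == 0)
		register_hooks();	/* first user registers the hooks */
	pthread_mutex_unlock(&defrag_mutex);
	return err;
}

static void defrag_disable(void)
{
	pthread_mutex_lock(&defrag_mutex);
	if (defrag_users && --defrag_users == 0)
		unregister_hooks();	/* last user tears them down */
	pthread_mutex_unlock(&defrag_mutex);
}

int main(void)
{
	defrag_enable();
	defrag_enable();	/* no second registration */
	defrag_disable();
	defrag_disable();	/* hooks go away only here */
	return 0;
}
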
32 : 128; } - if (optname == TCP_MD5SIG_EXT && + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; @@ -640,9 +643,9 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, - l3index); + l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index); + AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -650,12 +653,12 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, l3index, + AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index, + AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1404,7 +1407,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, l3index, key->key, key->keylen, + AF_INET6, 128, l3index, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } #endif diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e505bb007e9f..8d785232b479 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -133,7 +133,8 @@ static int compute_score(struct sock *sk, struct net *net, dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; - score++; + if (sk->sk_bound_dev_if) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 97095b7c9c64..5dcfd53a4ab6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -672,7 +672,7 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, u8 *ie, u8 ie_len) { struct ieee80211_supported_band *sband; - const u8 *cap; + const struct element *cap; const struct ieee80211_he_operation *he_oper = NULL; sband = ieee80211_get_sband(sdata); @@ -687,9 +687,10 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.he_support = true; - cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len); - if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3])) - he_oper = (void *)(cap + 3); + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ie_len); + if (cap && cap->datalen >= 1 + sizeof(*he_oper) && + cap->datalen >= 1 + ieee80211_he_oper_size(cap->data + 1)) + he_oper = (void *)(cap->data + 1); if (he_oper) sdata->vif.bss_conf.he_oper.params = diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c41273cefc51..f0f22eb4fd5f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -485,11 +485,11 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, mpext = mptcp_get_ext(skb); data_len = mpext ? 
mpext->data_len : 0; - /* we will check ext_copy.data_len in mptcp_write_options() to + /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ - opts->ext_copy.data_len = data_len; + opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; @@ -505,9 +505,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ - opts->ext_copy.data_seq = mpext->data_seq; - opts->ext_copy.subflow_seq = mpext->subflow_seq; - opts->ext_copy.csum = mpext->csum; + opts->data_seq = mpext->data_seq; + opts->subflow_seq = mpext->subflow_seq; + opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); @@ -1227,7 +1227,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } -static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum) { struct csum_pseudo_header header; __wsum csum; @@ -1237,15 +1237,21 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext) * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ - header.data_seq = cpu_to_be64(mpext->data_seq); - header.subflow_seq = htonl(mpext->subflow_seq); - header.data_len = htons(mpext->data_len); + header.data_seq = cpu_to_be64(data_seq); + header.subflow_seq = htonl(subflow_seq); + header.data_len = htons(data_len); header.csum = 0; - csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum)); return (__force u16)csum_fold(csum); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { @@ -1337,7 +1343,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - } else if (opts->ext_copy.data_len) { + } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; @@ -1366,14 +1372,17 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; - if (!opts->ext_copy.data_len) + if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { - put_unaligned_be32(opts->ext_copy.data_len << 16 | - mptcp_make_csum(&opts->ext_copy), ptr); + put_unaligned_be32(opts->data_len << 16 | + __mptcp_make_csum(opts->data_seq, + opts->subflow_seq, + opts->data_len, + opts->csum), ptr); } else { - put_unaligned_be32(opts->ext_copy.data_len << 16 | + put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e5df0b5971c8..d073b2111382 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,7 +528,6 @@ static bool mptcp_check_data_fin(struct sock *sk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); switch (sk->sk_state) { case TCP_ESTABLISHED: @@ -742,10 
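
__mptcp_make_csum() above sums a pseudo-header of the 64-bit data sequence number, the subflow sequence number, the data length and a zeroed checksum field, seeded with the checksum already accumulated over the DSS payload. The sketch below models that with an ordinary RFC 1071 ones'-complement sum in place of the kernel csum helpers; the field layout mirrors the hunk, but the checksum routine and its byte-order convention are simplifications, not the kernel implementation:

#include <arpa/inet.h>	/* htonl(), htons() */
#include <endian.h>	/* htobe64() */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pseudo-header covered by the MPTCP DSS checksum (16 bytes on common ABIs). */
struct dss_pseudo_header {
	uint64_t data_seq;	/* all fields in network byte order */
	uint32_t subflow_seq;
	uint16_t data_len;
	uint16_t csum;		/* zero while summing */
};

/* Plain RFC 1071 ones'-complement sum over a buffer, seeded with 'sum'. */
static uint32_t csum_add(uint32_t sum, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

/* Model of __mptcp_make_csum(): 'payload_sum' is assumed to be the
 * uncomplemented ones'-complement sum already taken over the payload with
 * the same byte-order convention as csum_add().
 */
static uint16_t dss_make_csum(uint64_t data_seq, uint32_t subflow_seq,
			      uint16_t data_len, uint16_t payload_sum)
{
	struct dss_pseudo_header hdr = {
		.data_seq    = htobe64(data_seq),
		.subflow_seq = htonl(subflow_seq),
		.data_len    = htons(data_len),
		.csum        = 0,
	};

	return (uint16_t)~csum_add(payload_sum, &hdr, sizeof(hdr));
}

int main(void)
{
	/* Example DSS mapping with an all-zero payload checksum seed. */
	printf("0x%04x\n", dss_make_csum(1, 1, 100, 0));
	return 0;
}

Splitting the helper this way lets mptcp_write_options() feed it either the fields cached in mptcp_out_options or the ones taken straight from the mptcp_ext, as the hunk above shows.
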
+741,9 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk)) { - set_bit(MPTCP_DATA_READY, &msk->flags); + if (move_skbs_to_msk(msk, ssk)) sk->sk_data_ready(sk); - } + mptcp_data_unlock(sk); } @@ -847,7 +845,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } @@ -1759,21 +1756,6 @@ out: return copied ? : ret; } -static void mptcp_wait_data(struct sock *sk, long *timeo) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct mptcp_sock *msk = mptcp_sk(sk); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - sk_wait_event(sk, timeo, - test_bit(MPTCP_DATA_READY, &msk->flags), &wait); - - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); -} - static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -2077,19 +2059,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld", timeo); - mptcp_wait_data(sk, &timeo); - } - - if (skb_queue_empty_lockless(&sk->sk_receive_queue) && - skb_queue_empty(&msk->receive_queue)) { - /* entire backlog drained, clear DATA_READY. */ - clear_bit(MPTCP_DATA_READY, &msk->flags); - - /* .. race-breaker: ssk might have gotten new data - * after last __mptcp_move_skbs() returned false. - */ - if (unlikely(__mptcp_move_skbs(msk))) - set_bit(MPTCP_DATA_READY, &msk->flags); + sk_wait_data(sk, &timeo, NULL); } out_err: @@ -2098,9 +2068,9 @@ out_err: tcp_recv_timestamp(msg, sk, &tss); } - pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", - msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty_lockless(&sk->sk_receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d:%d copied=%d", + msk, skb_queue_empty_lockless(&sk->sk_receive_queue), + skb_queue_empty(&msk->receive_queue), copied); if (!(flags & MSG_PEEK)) mptcp_rcv_space_adjust(msk, copied); @@ -2368,7 +2338,6 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); mptcp_close_wake_up(sk); @@ -3385,8 +3354,14 @@ unlock_fail: static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { - return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : - 0; + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) @@ -3421,7 +3396,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) - return mptcp_check_readable(msk); + return test_bit(MPTCP_DATA_READY, &msk->flags) ? 
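
mptcp_check_readable() stops consulting the MPTCP_DATA_READY flag and instead reports the socket readable whenever either the msk-level sk_receive_queue or the private receive_queue is non-empty, checked in that order so that a concurrent splice between the two queues always leaves at least one of them visibly non-empty. A minimal poll-style helper expressing the same check; the queue type is an illustrative stand-in:

#include <poll.h>
#include <stdbool.h>
#include <stddef.h>

struct skb_queue_model { size_t qlen; };	/* illustrative stand-in */

static bool queue_empty(const struct skb_queue_model *q)
{
	return q->qlen == 0;
}

/* Readable unless both queues are empty, checked in the same order as the
 * hunk above (EPOLLIN | EPOLLRDNORM in the kernel).
 */
static short readable_mask(const struct skb_queue_model *sk_rx,
			   const struct skb_queue_model *msk_rx)
{
	if (queue_empty(sk_rx) && queue_empty(msk_rx))
		return 0;
	return POLLIN;
}
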
EPOLLIN | EPOLLRDNORM : 0; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 54395266339d..92a747896f80 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -109,7 +109,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NETWORK_SECMARK - default m if NETFILTER_ADVANCED=n + default y if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c25097092a06..29ec3ef63edc 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -4090,6 +4090,11 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b9546defdc28..c0851fec11d4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -780,6 +780,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -790,8 +791,11 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table); + event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -1563,6 +1567,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -1573,8 +1578,11 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, + event, flags, ctx->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -2866,8 +2874,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, - const struct nft_rule *prule) + const struct nft_rule *rule, u64 handle) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -2887,9 +2894,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if (event != NFT_MSG_DELRULE && prule) { - if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle), + if (event != NFT_MSG_DELRULE && handle) { + if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } @@ -2925,7 +2931,10 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule 
*rule, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); + const struct nft_rule *prule; struct sk_buff *skb; + u64 handle = 0; + u16 flags = 0; int err; if (!ctx->report && @@ -2936,9 +2945,20 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (event == NFT_MSG_NEWRULE && + !list_is_first(&rule->list, &ctx->chain->rules) && + !list_is_last(&rule->list, &ctx->chain->rules)) { + prule = list_prev_entry(rule, list); + handle = prule->handle; + } + if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) + flags |= NLM_F_APPEND; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, - ctx->chain, rule, NULL); + event, flags, ctx->family, ctx->table, + ctx->chain, rule, handle); if (err < 0) { kfree_skb(skb); goto err; @@ -2964,6 +2984,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; + u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { @@ -2975,12 +2996,17 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); } + if (prule) + handle = prule->handle; + else + handle = 0; + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, prule) < 0) + table, chain, rule, handle) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3143,7 +3169,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, NULL); + family, table, chain, rule, 0); if (err < 0) goto err_fill_rule_info; @@ -3403,17 +3429,15 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + err = nft_delrule(&ctx, old_rule); + if (err < 0) + goto err_destroy_flow_rule; + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err_destroy_flow_rule; } - err = nft_delrule(&ctx, old_rule); - if (err < 0) { - nft_trans_destroy(trans); - goto err_destroy_flow_rule; - } - list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); @@ -3943,8 +3967,9 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, gfp_t gfp_flags) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); - struct sk_buff *skb; u32 portid = ctx->portid; + struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -3955,7 +3980,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + + err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; @@ -5231,12 +5259,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb, static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, - int event, u16 flags) + int event) { struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = 
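
Several notify helpers in these hunks now echo the creation-related flags of the triggering request (NLM_F_CREATE and NLM_F_EXCL, with NLM_F_APPEND added for rules) into the netlink event instead of always sending 0, so listeners can see how the change was requested. The masking itself is a one-liner; a minimal sketch using the uapi flag macros:

#include <linux/netlink.h>
#include <stdint.h>
#include <stdio.h>

/* Keep only the creation-related request flags for the notification,
 * as the notify helpers above now do before filling the message.
 */
static uint16_t notify_flags(uint16_t request_flags)
{
	return request_flags & (NLM_F_CREATE | NLM_F_EXCL);
}

int main(void)
{
	uint16_t req = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL;

	/* Only NLM_F_CREATE | NLM_F_EXCL survive into the event. */
	printf("0x%x\n", notify_flags(req));
	return 0;
}
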
ctx->portid; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -5246,6 +5275,9 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, set, elem); if (err < 0) { @@ -6921,7 +6953,7 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) + u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; @@ -6946,8 +6978,9 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, if (skb == NULL) goto err; - err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family, - table, obj, false); + err = nf_tables_fill_obj_info(skb, net, portid, seq, event, + flags & (NLM_F_CREATE | NLM_F_EXCL), + family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; @@ -6964,7 +6997,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->family, ctx->report, GFP_KERNEL); + ctx->flags, ctx->family, ctx->report, GFP_KERNEL); } /* @@ -7745,6 +7778,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -7755,8 +7789,11 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, - ctx->seq, event, 0, + ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); @@ -8634,7 +8671,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_setelem_activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_NEWSETELEM, 0); + NFT_MSG_NEWSETELEM); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8642,7 +8679,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM, 0); + NFT_MSG_DELSETELEM); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 5b02408a920b..3ced0eb6b7c3 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -342,12 +342,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. 
- */ __nft_release_basechain(ctx); } @@ -366,6 +360,9 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; + if (!check_net(ctx.net)) + return NOTIFY_DONE; + nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0363f533a42b..c4d1389f7185 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -60,7 +60,7 @@ static void nft_quota_obj_eval(struct nft_object *obj, if (overquota && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, - NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); + NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); } static int nft_quota_do_init(const struct nlattr * const tb[], diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 7b2f359bfce4..2f7cf5ecebf4 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -137,7 +137,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; - info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 24b7cf447bc5..ada47e59647a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -594,7 +594,10 @@ static int netlink_insert(struct sock *sk, u32 portid) /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); - nlk_sk(sk)->bound = portid; + /* Paired with lockless reads from netlink_bind(), + * netlink_connect() and netlink_sendmsg(). + */ + WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); @@ -1012,7 +1015,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; - bound = nlk->bound; + /* Paired with WRITE_ONCE() in netlink_insert() */ + bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); @@ -1098,8 +1102,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, /* No need for barriers here as we return to user-space without * using any of the bound attributes. + * Paired with WRITE_ONCE() in netlink_insert(). 
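
The netlink hunks annotate nlk->bound with WRITE_ONCE()/READ_ONCE() so that the lockless checks in bind, connect and sendmsg pair with the store in netlink_insert(), which makes the hashed socket visible before setting bound. A userspace model of that publish/observe pairing; the kernel relies on WRITE_ONCE()/READ_ONCE() plus the existing smp_wmb()/smp_rmb(), and the C11 release/acquire used here is only an approximation of that pairing:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy model: the insert path publishes the socket state first and only
 * then sets 'bound'; lockless readers check 'bound' before relying on
 * that state.
 */
struct nl_sock_model {
	uint32_t portid;	/* stands in for the published socket state */
	atomic_bool bound;
};

static void model_insert(struct nl_sock_model *nlk, uint32_t portid)
{
	nlk->portid = portid;
	atomic_store_explicit(&nlk->bound, true, memory_order_release);
}

static bool model_needs_autobind(struct nl_sock_model *nlk)
{
	/* As in netlink_connect()/netlink_sendmsg(): an unbound socket is
	 * autobound; a bound one may use nlk->portid safely afterwards.
	 */
	return !atomic_load_explicit(&nlk->bound, memory_order_acquire);
}
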
*/ - if (!nlk->bound) + if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { @@ -1888,7 +1893,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->bound) { + /* Paired with WRITE_ONCE() in netlink_insert() */ + if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 6024fad905ff..dda323e0a473 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -60,6 +60,9 @@ int nfc_proto_register(const struct nfc_protocol *nfc_proto) proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); + if (rc) + proto_unregister(nfc_proto->proto); + return rc; } EXPORT_SYMBOL(nfc_proto_register); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index fefc03674f4f..d63d2e5dc60c 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -277,6 +277,7 @@ int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) { struct digital_tg_mdaa_params *params; + int rc; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) @@ -291,8 +292,12 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); params->sc = DIGITAL_SENSF_FELICA_SC; - return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, - 500, digital_tg_recv_atr_req, NULL); + rc = digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, + 500, digital_tg_recv_atr_req, NULL); + if (rc) + kfree(params); + + return rc; } static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 84d2345c75a3..3adf4589852a 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -465,8 +465,12 @@ static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev, skb_put_u8(skb, sel_cmd); skb_put_u8(skb, DIGITAL_SDD_REQ_SEL_PAR); - return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, - target); + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, + target); + if (rc) + kfree_skb(skb); + + return rc; } static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index a2e72c003805..b911ab78bed9 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -334,6 +334,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); + if (conn_info == ndev->rf_conn_info) + ndev->rf_conn_info = NULL; devm_kfree(&ndev->nfc_dev->dev, conn_info); } } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index ad9df0cb4b98..90866ae45573 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -960,6 +960,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tmpl = p->tmpl; tcf_lastuse_update(&c->tcf_tm); + tcf_action_update_bstats(&c->common, skb); if (clear) { qdisc_skb_cb(skb)->post_ct = false; @@ -1049,7 +1050,6 @@ out_push: qdisc_skb_cb(skb)->post_ct = true; out_clear: - tcf_action_update_bstats(&c->common, skb); if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; return retval; diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index a579a4131d22..e1040421b797 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -233,6 +233,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned 
int limit) if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; + if (!q->ops->change) + return 0; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8766ab5b8788..5eb3b1b7ae5e 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -529,22 +529,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; spin_lock_bh(qdisc_lock(qdisc)); + if (qdisc_is_percpu_stats(qdisc)) { - cpu_bstats = qdisc->cpu_bstats; - cpu_qstats = qdisc->cpu_qstats; + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&qstats, + qdisc->cpu_qstats, + &qdisc->qstats, + qlen); + } else { + qlen += qdisc->q.qlen; + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; } - - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - cpu_bstats, &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - cpu_qstats, - &qdisc->qstats, - qlen); spin_unlock_bh(qdisc_lock(qdisc)); } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1ab2fc933a21..b9fd18d98646 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1641,6 +1641,10 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); + /* Note that taprio_reset() might not be called if an error + * happens in qdisc_create(), after taprio_init() has been called. + */ + hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index b8fa8f1a7277..c7503fd64915 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3697,7 +3697,7 @@ struct sctp_chunk *sctp_make_strreset_req( outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; - retval = sctp_make_reconf(asoc, outlen + inlen); + retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 32df65f68c12..fb3da4d8f4a3 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -156,6 +156,12 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort( void *arg, struct sctp_cmd_seq *commands); +static enum sctp_disposition +__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, void *arg, + struct sctp_cmd_seq *commands); + /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument * is set to be the size of a specific chunk we are testing. @@ -337,6 +343,14 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (!chunk->singleton) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the INIT chunk has a valid length. 
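
sctp_make_strreset_req() above now applies SCTP_PAD4() to the outgoing and incoming request lengths separately before allocating the chunk: each request parameter is padded to a 4-byte boundary on the wire, so summing the raw lengths can under-size the allocation. The rounding is the usual align-up idiom; the example lengths below are made up:

#include <stddef.h>
#include <stdio.h>

/* Round up to the next multiple of four, as SCTP_PAD4() does. */
#define PAD4(len)	(((len) + 3) & ~(size_t)3)

int main(void)
{
	size_t outlen = 18, inlen = 22;	/* made-up per-direction request lengths */

	printf("unpadded sum: %zu\n", outlen + inlen);			/* 40 */
	printf("padded sum:   %zu\n", PAD4(outlen) + PAD4(inlen));	/* 44 */
	return 0;
}
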
+ * Normally, this would cause an ABORT with a Protocol Violation + * error, but since we don't have an association, we'll + * just discard the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* If the packet is an OOTB packet which is temporarily on the * control endpoint, respond with an ABORT. */ @@ -351,14 +365,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (chunk->sctp_hdr->vtag != 0) return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands); - /* Make sure that the INIT chunk has a valid length. - * Normally, this would cause an ABORT with a Protocol Violation - * error, but since we don't have an association, we'll - * just discard the packet. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* If the INIT is coming toward a closing socket, we'll send back * and ABORT. Essentially, this catches the race of INIT being * backloged to the socket at the same time as the user issues close(). @@ -704,6 +710,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, struct sock *sk; int error = 0; + if (asoc && !sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* If the packet is an OOTB packet which is temporarily on the * control endpoint, respond with an ABORT. */ @@ -718,7 +727,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, * in sctp_unpack_cookie(). */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); /* If the endpoint is not listening or if the number of associations * on the TCP-style socket exceed the max backlog, respond with an @@ -1524,20 +1534,16 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!chunk->singleton) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the INIT chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* 3.1 A packet containing an INIT chunk MUST have a zero Verification * Tag. */ if (chunk->sctp_hdr->vtag != 0) return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands); - /* Make sure that the INIT chunk has a valid length. - * In this case, we generate a protocol violation since we have - * an association established. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); - if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port) return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands); @@ -1882,9 +1888,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( * its peer. */ if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) { - disposition = sctp_sf_do_9_2_reshutack(net, ep, asoc, - SCTP_ST_CHUNK(chunk->chunk_hdr->type), - chunk, commands); + disposition = __sctp_sf_do_9_2_reshutack(net, ep, asoc, + SCTP_ST_CHUNK(chunk->chunk_hdr->type), + chunk, commands); if (SCTP_DISPOSITION_NOMEM == disposition) goto nomem; @@ -2202,9 +2208,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( * enough for the chunk header. Cookie length verification is * done later. 
*/ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) { + if (!sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); + } /* "Decode" the chunk. We have no optional parameters so we * are in good shape. @@ -2341,7 +2349,7 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2387,7 +2395,7 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2657,7 +2665,7 @@ enum sctp_disposition sctp_sf_do_9_1_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2970,13 +2978,11 @@ enum sctp_disposition sctp_sf_do_9_2_shut_ctsn( * that belong to this association, it should discard the INIT chunk and * retransmit the SHUTDOWN ACK chunk. */ -enum sctp_disposition sctp_sf_do_9_2_reshutack( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition +__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_chunk *reply; @@ -3010,6 +3016,26 @@ nomem: return SCTP_DISPOSITION_NOMEM; } +enum sctp_disposition +sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, void *arg, + struct sctp_cmd_seq *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!chunk->singleton) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + + if (chunk->sctp_hdr->vtag != 0) + return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands); + + return __sctp_sf_do_9_2_reshutack(net, ep, asoc, type, arg, commands); +} + /* * sctp_sf_do_ecn_cwr * @@ -3662,6 +3688,9 @@ enum sctp_disposition sctp_sf_ootb(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); + if (asoc && !sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + ch = (struct sctp_chunkhdr *)chunk->chunk_hdr; do { /* Report violation if the chunk is less then minimal */ @@ -3777,12 +3806,6 @@ static enum sctp_disposition sctp_sf_shut_8_4_5( SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - /* If the chunk length is invalid, we don't want to process - * the reset of the packet. 
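
The new sctp_sf_do_9_2_reshutack() wrapper above, like the reordered checks in sctp_sf_do_5_1B_init() and sctp_sf_do_unexpected_init() earlier, front-loads the sanity checks on an incoming INIT: it must be the only chunk in the packet, long enough to be an INIT chunk, and carry a zero verification tag, in that order, before anything else looks at it. A condensed model of that gatekeeping; the verdict names and the minimum-length constant are illustrative, not the kernel's:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

enum verdict { DISCARD, ABORT, PROCESS };

struct chunk_model {
	bool	 singleton;	/* only chunk in the packet? */
	size_t	 length;	/* length taken from the chunk header */
	uint32_t vtag;		/* verification tag of the packet */
};

#define INIT_CHUNK_MIN	20	/* illustrative stand-in for sizeof(struct sctp_init_chunk) */

/* Gatekeeping order used above: bundled -> discard, short -> discard,
 * non-zero vtag -> abort, otherwise hand the chunk to the real handler.
 */
static enum verdict check_init(const struct chunk_model *c)
{
	if (!c->singleton)
		return DISCARD;			/* sctp_sf_pdiscard() */
	if (c->length < INIT_CHUNK_MIN)
		return DISCARD;			/* too short to even parse */
	if (c->vtag != 0)
		return ABORT;			/* sctp_sf_tabort_8_4_8() */
	return PROCESS;
}

int main(void)
{
	struct chunk_model bundled_init = { .singleton = false, .length = 64, .vtag = 0 };

	return check_init(&bundled_init) == DISCARD ? 0 : 1;
}
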
- */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* We need to discard the rest of the packet to prevent * potential boomming attacks from additional bundled chunks. * This is documented in SCTP Threats ID. @@ -3810,6 +3833,9 @@ enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -3845,6 +3871,11 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + /* Make sure that the ASCONF ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + /* ADD-IP: Section 4.1.1 * This chunk MUST be sent in an authenticated way by using * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk @@ -3853,13 +3884,7 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, */ if (!asoc->peer.asconf_capable || (!net->sctp.addip_noauth && !chunk->auth)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, - commands); - - /* Make sure that the ASCONF ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); hdr = (struct sctp_addiphdr *)chunk->skb->data; serial = ntohl(hdr->serial); @@ -3988,6 +4013,12 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + /* Make sure that the ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(asconf_ack, + sizeof(struct sctp_addip_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + /* ADD-IP, Section 4.1.2: * This chunk MUST be sent in an authenticated way by using * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk @@ -3996,14 +4027,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, */ if (!asoc->peer.asconf_capable || (!net->sctp.addip_noauth && !asconf_ack->auth)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, - commands); - - /* Make sure that the ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(asconf_ack, - sizeof(struct sctp_addip_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data; rcvd_serial = ntohl(addip_hdr->serial); @@ -4575,6 +4599,9 @@ enum sctp_disposition sctp_sf_discard_chunk(struct net *net, { struct sctp_chunk *chunk = arg; + if (asoc && !sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. * Since we don't know the chunk type, we use a general * chunkhdr structure to make a comparison. @@ -4642,6 +4669,9 @@ enum sctp_disposition sctp_sf_violation(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. 
*/ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -6348,6 +6378,7 @@ static struct sctp_packet *sctp_ootb_pkt_new( * yet. */ switch (chunk->chunk_hdr->type) { + case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: { struct sctp_initack_chunk *initack; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c038efc23ce3..78b663dbfa1f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1057,7 +1057,7 @@ static void smc_connect_work(struct work_struct *work) if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & - (TCPF_SYN_SENT | TCP_SYN_RECV)) { + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f23f558054a7..99acd337ba90 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -150,9 +150,11 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) again: link = conn->lnk; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) - return rc; + goto put_out; spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { @@ -160,6 +162,7 @@ again: spin_unlock_bh(&conn->send_lock); smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); + smc_wr_tx_link_put(link); if (again) return -ENOLINK; again = true; @@ -167,6 +170,8 @@ again: } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); +put_out: + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8280c938be80..d2206743dc71 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -949,7 +949,7 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, to_lnk = &lgr->lnk[i]; break; } - if (!to_lnk) { + if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } @@ -981,24 +981,26 @@ again: read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); + smc_wr_tx_link_put(to_lnk); return to_lnk; + +err_out: + smcr_link_down_cond_sched(to_lnk); + smc_wr_tx_link_put(to_lnk); + return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2e7560eba981..f1d323439a2a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -383,9 +383,11 @@ int smc_llc_send_confirm_link(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; confllc = (struct smc_llc_msg_confirm_link *)wr_buf; memset(confllc, 0, sizeof(*confllc)); confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; @@ -402,6 +404,8 @@ int smc_llc_send_confirm_link(struct smc_link *link, confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = 
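
Every SMC LLC and CDC send path in these hunks now brackets its work with smc_wr_tx_link_hold() and smc_wr_tx_link_put(), bailing out with -ENOLINK when the hold fails and funnelling all other exits through a single put (the put_out labels). A small model of that hold/put discipline; the wake-up of waiters when the refcount reaches zero and the real link-usability test are omitted:

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only while the link is usable, and pair every
 * successful hold with exactly one put on every exit path.
 */
struct link_model {
	atomic_bool usable;
	atomic_int  wr_tx_refcnt;
};

static bool link_hold(struct link_model *lnk)
{
	if (!atomic_load(&lnk->usable))
		return false;
	atomic_fetch_add(&lnk->wr_tx_refcnt, 1);
	return true;
}

static void link_put(struct link_model *lnk)
{
	atomic_fetch_sub(&lnk->wr_tx_refcnt, 1);
}

static int send_on_link(struct link_model *lnk)
{
	int rc = 0;

	if (!link_hold(lnk))
		return -1;	/* -ENOLINK in the kernel paths above */

	/* ... get a send slot, build the message, post it ... */

	link_put(lnk);		/* single exit point pairs the hold */
	return rc;
}

int main(void)
{
	struct link_model lnk;

	atomic_init(&lnk.usable, true);
	atomic_init(&lnk.wr_tx_refcnt, 0);
	return send_on_link(&lnk);
}
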
smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -415,9 +419,11 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_link *link; int i, rc, rtok_ix; + if (!smc_wr_tx_link_hold(send_link)) + return -ENOLINK; rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; @@ -444,6 +450,8 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); +put_out: + smc_wr_tx_link_put(send_link); return rc; } @@ -456,9 +464,11 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; @@ -467,6 +477,8 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -480,9 +492,11 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; addllc = (struct smc_llc_msg_add_link *)wr_buf; memset(addllc, 0, sizeof(*addllc)); @@ -504,6 +518,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], } /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -517,9 +533,11 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; delllc = (struct smc_llc_msg_del_link *)wr_buf; memset(delllc, 0, sizeof(*delllc)); @@ -536,6 +554,8 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -547,9 +567,11 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; testllc = (struct smc_llc_msg_test_link *)wr_buf; memset(testllc, 0, sizeof(*testllc)); testllc->hd.common.type = SMC_LLC_TEST_LINK; @@ -557,6 +579,8 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -567,13 +591,16 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union 
smc_llc_msg)); - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } /* schedule an llc send on link, may wait for buffers, @@ -586,13 +613,16 @@ static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); + rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +put_out: + smc_wr_tx_link_put(link); + return rc; } /********************************* receive ***********************************/ @@ -672,9 +702,11 @@ static int smc_llc_add_link_cont(struct smc_link *link, struct smc_buf_desc *rmb; u8 n; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; memset(addc_llc, 0, sizeof(*addc_llc)); @@ -706,7 +738,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); if (lgr->role == SMC_CLNT) addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } static int smc_llc_cli_rkey_exchange(struct smc_link *link, @@ -1787,7 +1822,7 @@ void smc_llc_link_active(struct smc_link *link) link->smcibdev->ibdev->name, link->ibport); link->state = SMC_LNK_ACTIVE; if (link->lgr->llc_testlink_time) { - link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; + link->llc_testlink_time = link->lgr->llc_testlink_time; schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c79361dfcdfb..738a4a99c827 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -496,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; @@ -505,8 +505,11 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; + if (!link || !smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { + smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -547,22 +550,7 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) out_unlock: spin_unlock_bh(&conn->send_lock); - return rc; -} - -static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) -{ - struct smc_link *link = conn->lnk; - int rc = -ENOLINK; - - if (!link) - return rc; - - atomic_inc(&link->wr_tx_refcnt); - if (smc_link_usable(link)) - rc = _smcr_tx_sndbuf_nonempty(conn); - if (atomic_dec_and_test(&link->wr_tx_refcnt)) - wake_up_all(&link->wr_tx_wait); + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 423b8709f1c9..2bc626f230a5 100644 --- a/net/smc/smc_wr.h +++ 
b/net/smc/smc_wr.h @@ -60,6 +60,20 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline bool smc_wr_tx_link_hold(struct smc_link *link) +{ + if (!smc_link_usable(link)) + return false; + atomic_inc(&link->wr_tx_refcnt); + return true; +} + +static inline void smc_wr_tx_link_put(struct smc_link *link) +{ + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 3e776e3dff91..1f2817195549 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -645,7 +645,7 @@ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; - } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { + } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index c9391d38de85..dc60c32bb70d 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -2285,43 +2285,53 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) u16 key_gen = msg_key_gen(hdr); u16 size = msg_data_sz(hdr); u8 *data = msg_data(hdr); + unsigned int keylen; + + /* Verify whether the size can exist in the packet */ + if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) { + pr_debug("%s: message data size is too small\n", rx->name); + goto exit; + } + + keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); + + /* Verify the supplied size values */ + if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || + keylen > TIPC_AEAD_KEY_SIZE_MAX)) { + pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); + goto exit; + } spin_lock(&rx->lock); if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) { pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name, rx->skey, key_gen, rx->key_gen); - goto exit; + goto exit_unlock; } /* Allocate memory for the key */ skey = kmalloc(size, GFP_ATOMIC); if (unlikely(!skey)) { pr_err("%s: unable to allocate memory for skey\n", rx->name); - goto exit; + goto exit_unlock; } /* Copy key from msg data */ - skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); + skey->keylen = keylen; memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME); memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->keylen); - /* Sanity check */ - if (unlikely(size != tipc_aead_key_size(skey))) { - kfree(skey); - skey = NULL; - goto exit; - } - rx->key_gen = key_gen; rx->skey_mode = msg_key_mode(hdr); rx->skey = skey; rx->nokey = 0; mb(); /* for nokey flag */ -exit: +exit_unlock: spin_unlock(&rx->lock); +exit: /* Schedule the key attaching on this crypto */ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0))) return true; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index fde56ff49163..9ab81db8a654 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -681,12 +681,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_BASE][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; 
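
A note on the .stream_memory_read to .sock_is_readable rename in the tls_main.c hunk just above: the new name reflects that the hook is no longer TCP/TLS-specific. Any protocol that keeps ready data outside the plain receive queue (decrypted TLS records, sockmap psock queues, and the af_unix users later in this series) can supply it, and poll paths query it through the sk_is_readable() helper that the af_unix hunks further down start using. The sketch below shows roughly how a poll handler consults such a hook; it is illustrative only, and all my_* identifiers are hypothetical, not in-tree code.

/*
 * Illustrative sketch (not the in-tree implementation): a per-proto
 * "readable" callback consulted from poll, mirroring the sock_is_readable
 * hook wired up above.  All my_* identifiers are hypothetical.
 */
#include <net/sock.h>
#include <linux/poll.h>

static inline bool my_sk_is_readable(struct sock *sk)
{
	/* ask the protocol whether it holds data outside sk_receive_queue */
	if (sk->sk_prot->sock_is_readable)
		return sk->sk_prot->sock_is_readable(sk);
	return false;
}

static __poll_t my_poll(struct file *file, struct socket *sock,
			poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;

	sock_poll_wait(file, sock, wait);

	/* data queued in the ordinary receive queue */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* data held elsewhere, e.g. decrypted TLS records or a psock queue */
	if (my_sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

Compare with the unix_poll() and unix_dgram_poll() hunks below, which add exactly this second check via sk_is_readable().
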
prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_SW][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 4feb95e34b64..1b08b877a890 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -35,6 +35,7 @@ * SOFTWARE. */ +#include <linux/bug.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/splice.h> @@ -43,6 +44,14 @@ #include <net/strparser.h> #include <net/tls.h> +noinline void tls_err_abort(struct sock *sk, int err) +{ + WARN_ON_ONCE(err >= 0); + /* sk->sk_err should contain a positive error code. */ + sk->sk_err = -err; + sk_error_report(sk); +} + static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { @@ -419,7 +428,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); return rc; } @@ -450,7 +459,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) /* If err is already set on socket, return the same code */ if (sk->sk_err) { - ctx->async_wait.err = sk->sk_err; + ctx->async_wait.err = -sk->sk_err; } else { ctx->async_wait.err = err; tls_err_abort(sk, err); @@ -763,7 +772,7 @@ static int tls_push_record(struct sock *sk, int flags, msg_pl->sg.size + prot->tail_size, i); if (rc < 0) { if (rc != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (split) { tls_ctx->pending_open_record_frags = true; tls_merge_open_record(sk, rec, tmp, orig_end); @@ -1827,7 +1836,7 @@ int tls_sw_recvmsg(struct sock *sk, err = decrypt_skb_update(sk, skb, &msg->msg_iter, &chunk, &zc, async_capable); if (err < 0 && err != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto recv_end; } @@ -2007,7 +2016,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (err < 0) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto splice_read_end; } ctx->decrypted = 1; @@ -2026,7 +2035,7 @@ splice_read_end: return copied ? 
: err; } -bool tls_sw_stream_read(const struct sock *sk) +bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index efac5989edb5..78e08e82c08c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -828,7 +828,7 @@ static void unix_unhash(struct sock *sk) } struct proto unix_dgram_proto = { - .name = "UNIX-DGRAM", + .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, @@ -2882,6 +2882,9 @@ static int unix_shutdown(struct socket *sock, int mode) unix_state_lock(sk); sk->sk_shutdown |= mode; + if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && + mode == SHUTDOWN_MASK) + sk->sk_state = TCP_CLOSE; other = unix_peer(sk); if (other) sock_hold(other); @@ -2904,12 +2907,10 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) { + if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - other->sk_state = TCP_CLOSE; - } else if (peer_mode & RCV_SHUTDOWN) { + else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - } } if (other) sock_put(other); @@ -3051,6 +3052,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && @@ -3090,6 +3093,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, /* readable? 
*/ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET) { diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index b927e2baae50..452376c6f419 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -102,6 +102,7 @@ static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; } static void unix_stream_bpf_rebuild_protos(struct proto *prot, @@ -110,6 +111,7 @@ static void unix_stream_bpf_rebuild_protos(struct proto *prot, *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; prot->unhash = sock_map_unhash; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 03323121ca50..aaba847d79eb 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -524,6 +524,7 @@ use_default_name: INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); INIT_WORK(&rdev->mgmt_registrations_update_wk, cfg80211_mgmt_registrations_update_wk); + spin_lock_init(&rdev->mgmt_registrations_lock); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -1279,7 +1280,6 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); - spin_lock_init(&wdev->mgmt_registrations_lock); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index b35d0db12f1d..1720abf36f92 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -100,6 +100,8 @@ struct cfg80211_registered_device { struct work_struct propagate_cac_done_wk; struct work_struct mgmt_registrations_update_wk; + /* lock for all wdev lists */ + spinlock_t mgmt_registrations_lock; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 3aa69b375a10..783acd2c4211 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -452,9 +452,9 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) lockdep_assert_held(&rdev->wiphy.mtx); - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } @@ -479,7 +479,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } @@ -503,6 +503,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -548,7 +549,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (!nreg) return -ENOMEM; - 
spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); @@ -583,7 +584,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); @@ -591,7 +592,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, out: kfree(nreg); - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } @@ -602,7 +603,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) @@ -615,7 +616,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) schedule_work(&rdev->mgmt_registrations_update_wk); } - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; @@ -628,15 +629,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(®->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } @@ -784,7 +786,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, data = buf + ieee80211_hdrlen(mgmt->frame_control); data_len = len - ieee80211_hdrlen(mgmt->frame_control); - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) @@ -808,7 +810,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, break; } - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 11c68b159324..adc0d14cfd86 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -418,14 +418,17 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, } ssid_len = ssid[1]; ssid = ssid + 2; - rcu_read_unlock(); /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { - if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) + if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) { + rcu_read_unlock(); return 0; + } } + rcu_read_unlock(); + /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; diff --git a/net/wireless/util.c b/net/wireless/util.c 
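
Before the util.c and xfrm changes, one remark on the scan.c hunk above: it is a textbook RCU scoping fix. The ssid pointer refers into an RCU-protected element buffer, so the is_bss() comparisons must happen before rcu_read_unlock(); the patch therefore moves the unlock past the list walk instead of dropping the read-side lock early. A minimal, self-contained illustration of the rule follows; the demo_* names and types are hypothetical, only the locking pattern matches the fix.

/*
 * Illustrative only: a pointer obtained under rcu_read_lock() must not be
 * dereferenced after rcu_read_unlock().  Hypothetical types and helpers.
 */
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/string.h>

struct demo_blob {
	int len;
	u8 data[];
};

static bool demo_find(struct demo_blob __rcu *src, const u8 *needle, int nlen)
{
	struct demo_blob *b;
	bool match = false;
	int i;

	rcu_read_lock();
	b = rcu_dereference(src);
	if (b) {
		/* every b->data access stays inside the read-side section */
		for (i = 0; i + nlen <= b->len; i++) {
			if (!memcmp(&b->data[i], needle, nlen)) {
				match = true;
				break;
			}
		}
	}
	rcu_read_unlock();	/* b may be freed by an updater after this */

	return match;
}

Unlocking ahead of the loop, as the old cfg80211_add_nontrans_list() code did, would let the buffer behind ssid be freed while is_bss() was still reading it.
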
index 18dba3d7c638..a1a99a574984 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1028,14 +1028,14 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; - /* if it's part of a bridge, reject changing type to station/ibss */ - if (netif_is_bridge_port(dev) && - (ntype == NL80211_IFTYPE_ADHOC || - ntype == NL80211_IFTYPE_STATION || - ntype == NL80211_IFTYPE_P2P_CLIENT)) - return -EBUSY; - if (ntype != otype) { + /* if it's part of a bridge, reject changing type to station/ibss */ + if (netif_is_bridge_port(dev) && + (ntype == NL80211_IFTYPE_ADHOC || + ntype == NL80211_IFTYPE_STATION || + ntype == NL80211_IFTYPE_P2P_CLIENT)) + return -EBUSY; + dev->ieee80211_ptr->use_4addr = false; dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 03b66d154b2b..3a3cb09eec12 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1961,24 +1961,65 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, return skb; } +static int xfrm_notify_userpolicy(struct net *net) +{ + struct xfrm_userpolicy_default *up; + int len = NLMSG_ALIGN(sizeof(*up)); + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err; + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*up), 0); + if (nlh == NULL) { + kfree_skb(skb); + return -EMSGSIZE; + } + + up = nlmsg_data(nlh); + up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? 
+ XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + + nlmsg_end(skb, nlh); + + rcu_read_lock(); + err = xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); + rcu_read_unlock(); + + return err; +} + static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *up = nlmsg_data(nlh); - u8 dirmask; - u8 old_default = net->xfrm.policy_default; - if (up->dirmask >= XFRM_USERPOLICY_DIRMASK_MAX) - return -EINVAL; + if (up->in == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_IN; + else if (up->in == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_IN; - dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + if (up->fwd == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_FWD; + else if (up->fwd == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_FWD; - net->xfrm.policy_default = (old_default & (0xff ^ dirmask)) - | (up->action << up->dirmask); + if (up->out == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_OUT; + else if (up->out == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_OUT; rt_genid_bump_all(net); + xfrm_notify_userpolicy(net); return 0; } @@ -1988,13 +2029,11 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *r_skb; struct nlmsghdr *r_nlh; struct net *net = sock_net(skb->sk); - struct xfrm_userpolicy_default *r_up, *up; + struct xfrm_userpolicy_default *r_up; int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); u32 portid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; - up = nlmsg_data(nlh); - r_skb = nlmsg_new(len, GFP_ATOMIC); if (!r_skb) return -ENOMEM; @@ -2007,8 +2046,12 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, r_up = nlmsg_data(r_nlh); - r_up->action = ((net->xfrm.policy_default & (1 << up->dirmask)) >> up->dirmask); - r_up->dirmask = up->dirmask; + r_up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? 
+ XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; nlmsg_end(r_skb, r_nlh); return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4dc20be5fb96..5fd48a8d4f10 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -322,17 +322,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h -include $(BPF_SAMPLES_PATH)/Makefile.target -VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ - ../../../../vmlinux \ - /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ + $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ + $(abspath ./vmlinux) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -ifeq ($(VMLINUX_BTF),) -$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -endif - $(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) ifeq ($(VMLINUX_H),) $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ @@ -340,6 +334,11 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif +ifeq ($(VMLINUX_BTF),) + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\ + build the kernel or set VMLINUX_BTF variable) +endif + clean-files += vmlinux.h # Get Clang's default includes on this system, as opposed to those seen by diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index aee04534483a..29c3bb6ad1cd 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* eBPF instruction mini library */ #ifndef __BPF_INSN_H #define __BPF_INSN_H diff --git a/samples/bpf/xdp_redirect_map_multi.bpf.c b/samples/bpf/xdp_redirect_map_multi.bpf.c index 8f59d430cb64..bb0a5a3bfcf0 100644 --- a/samples/bpf/xdp_redirect_map_multi.bpf.c +++ b/samples/bpf/xdp_redirect_map_multi.bpf.c @@ -5,11 +5,6 @@ #include "xdp_sample.bpf.h" #include "xdp_sample_shared.h" -enum { - BPF_F_BROADCAST = (1ULL << 3), - BPF_F_EXCLUDE_INGRESS = (1ULL << 4), -}; - struct { __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); __uint(key_size, sizeof(int)); diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 952e46876329..4aad28480035 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -19,6 +19,10 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF) \ += -fplugin-arg-structleak_plugin-byref gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL) \ += -fplugin-arg-structleak_plugin-byref-all +ifdef CONFIG_GCC_PLUGIN_STRUCTLEAK + DISABLE_STRUCTLEAK_PLUGIN += -fplugin-arg-structleak_plugin-disable +endif +export DISABLE_STRUCTLEAK_PLUGIN gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK) \ += -DSTRUCTLEAK_PLUGIN diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index fd9777f63f14..9dbab13329fa 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -82,10 +82,8 @@ cat << EOF #define __IGNORE_truncate64 #define __IGNORE_stat64 #define __IGNORE_lstat64 -#define __IGNORE_fstat64 #define __IGNORE_fcntl64 #define __IGNORE_fadvise64_64 -#define __IGNORE_fstatat64 #define __IGNORE_fstatfs64 #define __IGNORE_statfs64 #define __IGNORE_llseek @@ -253,6 +251,10 @@ cat << EOF #define __IGNORE_getpmsg #define __IGNORE_putpmsg #define __IGNORE_vserver + +/* 64-bit ports never needed these, and new 32-bit ports can use statx */ +#define __IGNORE_fstat64 +#define __IGNORE_fstatat64 EOF } diff --git 
a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 8f6b13ae46bf..7d631aaa0ae1 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -189,7 +189,7 @@ if ($arch =~ /(x86(_64)?)|(i386)/) { $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $weak_regex = "^[0-9a-fA-F]+\\s+([wW])\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; -$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; +$function_regex = "^([0-9a-fA-F]+)\\s+<([^^]*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s(mcount|__fentry__)\$"; $section_type = '@progbits'; $mcount_adjust = 0; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index e3d79a7b6db6..b5d5333ab330 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -918,6 +918,13 @@ void key_change_session_keyring(struct callback_head *twork) return; } + /* If get_ucounts fails more bits are needed in the refcount */ + if (unlikely(!get_ucounts(old->ucounts))) { + WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); + put_cred(new); + return; + } + new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; @@ -927,6 +934,7 @@ void key_change_session_keyring(struct callback_head *twork) new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); + new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index d59276f48d4f..94ea2a8b2bb7 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -126,6 +126,8 @@ static const struct nlmsg_perm nlmsg_xfrm_perms[] = { XFRM_MSG_NEWSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_MAPPING, NETLINK_XFRM_SOCKET__NLMSG_READ }, + { XFRM_MSG_SETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, + { XFRM_MSG_GETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_audit_perms[] = @@ -189,7 +191,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_MAPPING); + BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_GETDEFAULT); err = nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms, sizeof(nlmsg_xfrm_perms)); break; diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index a59de24695ec..dfe5a64e19d2 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -468,6 +468,76 @@ static int snd_pcm_ioctl_sync_ptr_x32(struct snd_pcm_substream *substream, } #endif /* CONFIG_X86_X32 */ +#ifdef __BIG_ENDIAN +typedef char __pad_before_u32[4]; +typedef char __pad_after_u32[0]; +#else +typedef char __pad_before_u32[0]; +typedef char __pad_after_u32[4]; +#endif + +/* PCM 2.0.15 API definition had a bug in mmap control; it puts the avail_min + * at the wrong offset due to a typo in padding type. + * The bug hits only 32bit. + * A workaround for incorrect read/write is needed only in 32bit compat mode. + */ +struct __snd_pcm_mmap_control64_buggy { + __pad_before_u32 __pad1; + __u32 appl_ptr; + __pad_before_u32 __pad2; /* SiC! 
here is the bug */ + __pad_before_u32 __pad3; + __u32 avail_min; + __pad_after_uframe __pad4; +}; + +static int snd_pcm_ioctl_sync_ptr_buggy(struct snd_pcm_substream *substream, + struct snd_pcm_sync_ptr __user *_sync_ptr) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + struct snd_pcm_sync_ptr sync_ptr; + struct __snd_pcm_mmap_control64_buggy *sync_cp; + volatile struct snd_pcm_mmap_status *status; + volatile struct snd_pcm_mmap_control *control; + int err; + + memset(&sync_ptr, 0, sizeof(sync_ptr)); + sync_cp = (struct __snd_pcm_mmap_control64_buggy *)&sync_ptr.c.control; + if (get_user(sync_ptr.flags, (unsigned __user *)&(_sync_ptr->flags))) + return -EFAULT; + if (copy_from_user(sync_cp, &(_sync_ptr->c.control), sizeof(*sync_cp))) + return -EFAULT; + status = runtime->status; + control = runtime->control; + if (sync_ptr.flags & SNDRV_PCM_SYNC_PTR_HWSYNC) { + err = snd_pcm_hwsync(substream); + if (err < 0) + return err; + } + snd_pcm_stream_lock_irq(substream); + if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL)) { + err = pcm_lib_apply_appl_ptr(substream, sync_cp->appl_ptr); + if (err < 0) { + snd_pcm_stream_unlock_irq(substream); + return err; + } + } else { + sync_cp->appl_ptr = control->appl_ptr; + } + if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN)) + control->avail_min = sync_cp->avail_min; + else + sync_cp->avail_min = control->avail_min; + sync_ptr.s.status.state = status->state; + sync_ptr.s.status.hw_ptr = status->hw_ptr; + sync_ptr.s.status.tstamp = status->tstamp; + sync_ptr.s.status.suspended_state = status->suspended_state; + sync_ptr.s.status.audio_tstamp = status->audio_tstamp; + snd_pcm_stream_unlock_irq(substream); + if (copy_to_user(_sync_ptr, &sync_ptr, sizeof(sync_ptr))) + return -EFAULT; + return 0; +} + /* */ enum { @@ -537,7 +607,7 @@ static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned l if (in_x32_syscall()) return snd_pcm_ioctl_sync_ptr_x32(substream, argp); #endif /* CONFIG_X86_X32 */ - return snd_pcm_common_ioctl(file, substream, cmd, argp); + return snd_pcm_ioctl_sync_ptr_buggy(substream, argp); case SNDRV_PCM_IOCTL_HW_REFINE32: return snd_pcm_ioctl_hw_params_compat(substream, 1, argp); case SNDRV_PCM_IOCTL_HW_PARAMS32: diff --git a/sound/core/seq_device.c b/sound/core/seq_device.c index 382275c5b193..7f3fd8eb016f 100644 --- a/sound/core/seq_device.c +++ b/sound/core/seq_device.c @@ -156,6 +156,8 @@ static int snd_seq_device_dev_free(struct snd_device *device) struct snd_seq_device *dev = device->device_data; cancel_autoload_drivers(); + if (dev->private_free) + dev->private_free(dev); put_device(&dev->dev); return 0; } @@ -183,11 +185,7 @@ static int snd_seq_device_dev_disconnect(struct snd_device *device) static void snd_seq_dev_release(struct device *dev) { - struct snd_seq_device *sdev = to_seq_dev(dev); - - if (sdev->private_free) - sdev->private_free(sdev); - kfree(sdev); + kfree(to_seq_dev(dev)); } /* diff --git a/sound/hda/hdac_controller.c b/sound/hda/hdac_controller.c index 062da7a7a586..f7bd6e2db085 100644 --- a/sound/hda/hdac_controller.c +++ b/sound/hda/hdac_controller.c @@ -421,8 +421,9 @@ int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset) if (!full_reset) goto skip_reset; - /* clear STATESTS */ - snd_hdac_chip_writew(bus, STATESTS, STATESTS_INT_MASK); + /* clear STATESTS if not in reset */ + if (snd_hdac_chip_readb(bus, GCTL) & AZX_GCTL_RESET) + snd_hdac_chip_writew(bus, STATESTS, STATESTS_INT_MASK); /* reset controller */ snd_hdac_bus_enter_link_reset(bus); diff --git 
a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 2523b23389e9..1c8bffc3eec6 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -298,29 +298,31 @@ int snd_hda_codec_configure(struct hda_codec *codec) { int err; + if (codec->configured) + return 0; + if (is_generic_config(codec)) codec->probe_id = HDA_CODEC_ID_GENERIC; else codec->probe_id = 0; - err = snd_hdac_device_register(&codec->core); - if (err < 0) - return err; + if (!device_is_registered(&codec->core.dev)) { + err = snd_hdac_device_register(&codec->core); + if (err < 0) + return err; + } if (!codec->preset) codec_bind_module(codec); if (!codec->preset) { err = codec_bind_generic(codec); if (err < 0) { - codec_err(codec, "Unable to bind the codec\n"); - goto error; + codec_dbg(codec, "Unable to bind the codec\n"); + return err; } } + codec->configured = 1; return 0; - - error: - snd_hdac_device_unregister(&codec->core); - return err; } EXPORT_SYMBOL_GPL(snd_hda_codec_configure); diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index a9ebefd60cf6..0c4a337c9fc0 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -791,6 +791,7 @@ void snd_hda_codec_cleanup_for_unbind(struct hda_codec *codec) snd_array_free(&codec->nids); remove_conn_list(codec); snd_hdac_regmap_exit(&codec->core); + codec->configured = 0; } EXPORT_SYMBOL_GPL(snd_hda_codec_cleanup_for_unbind); diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c index 7cd452831fd3..930ae4002a81 100644 --- a/sound/pci/hda/hda_controller.c +++ b/sound/pci/hda/hda_controller.c @@ -25,6 +25,7 @@ #include <sound/core.h> #include <sound/initval.h> #include "hda_controller.h" +#include "hda_local.h" #define CREATE_TRACE_POINTS #include "hda_controller_trace.h" @@ -1248,17 +1249,24 @@ EXPORT_SYMBOL_GPL(azx_probe_codecs); int azx_codec_configure(struct azx *chip) { struct hda_codec *codec, *next; + int success = 0; - /* use _safe version here since snd_hda_codec_configure() deregisters - * the device upon error and deletes itself from the bus list. - */ - list_for_each_codec_safe(codec, next, &chip->bus) { - snd_hda_codec_configure(codec); + list_for_each_codec(codec, &chip->bus) { + if (!snd_hda_codec_configure(codec)) + success++; } - if (!azx_bus(chip)->num_codecs) - return -ENODEV; - return 0; + if (success) { + /* unregister failed codecs if any codec has been probed */ + list_for_each_codec_safe(codec, next, &chip->bus) { + if (!codec->configured) { + codec_err(codec, "Unable to configure, disabling\n"); + snd_hdac_device_unregister(&codec->core); + } + } + } + + return success ? 
0 : -ENODEV; } EXPORT_SYMBOL_GPL(azx_codec_configure); diff --git a/sound/pci/hda/hda_controller.h b/sound/pci/hda/hda_controller.h index 3062f87380b1..f5bf295eb830 100644 --- a/sound/pci/hda/hda_controller.h +++ b/sound/pci/hda/hda_controller.h @@ -41,7 +41,7 @@ /* 24 unused */ #define AZX_DCAPS_COUNT_LPIB_DELAY (1 << 25) /* Take LPIB as delay */ #define AZX_DCAPS_PM_RUNTIME (1 << 26) /* runtime PM support */ -/* 27 unused */ +#define AZX_DCAPS_RETRY_PROBE (1 << 27) /* retry probe if no codec is configured */ #define AZX_DCAPS_CORBRP_SELF_CLEAR (1 << 28) /* CORBRP clears itself after reset */ #define AZX_DCAPS_NO_MSI64 (1 << 29) /* Stick to 32-bit MSIs */ #define AZX_DCAPS_SEPARATE_STREAM_TAG (1 << 30) /* capture and playback use separate stream tag */ diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 47777439961c..4d22e7adeee8 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -307,7 +307,8 @@ enum { /* quirks for AMD SB */ #define AZX_DCAPS_PRESET_AMD_SB \ (AZX_DCAPS_NO_TCSEL | AZX_DCAPS_AMD_WORKAROUND |\ - AZX_DCAPS_SNOOP_TYPE(ATI) | AZX_DCAPS_PM_RUNTIME) + AZX_DCAPS_SNOOP_TYPE(ATI) | AZX_DCAPS_PM_RUNTIME |\ + AZX_DCAPS_RETRY_PROBE) /* quirks for Nvidia */ #define AZX_DCAPS_PRESET_NVIDIA \ @@ -1723,7 +1724,7 @@ static void azx_check_snoop_available(struct azx *chip) static void azx_probe_work(struct work_struct *work) { - struct hda_intel *hda = container_of(work, struct hda_intel, probe_work); + struct hda_intel *hda = container_of(work, struct hda_intel, probe_work.work); azx_probe_continue(&hda->chip); } @@ -1828,7 +1829,7 @@ static int azx_create(struct snd_card *card, struct pci_dev *pci, } /* continue probing in work context as may trigger request module */ - INIT_WORK(&hda->probe_work, azx_probe_work); + INIT_DELAYED_WORK(&hda->probe_work, azx_probe_work); *rchip = chip; @@ -2142,7 +2143,7 @@ static int azx_probe(struct pci_dev *pci, #endif if (schedule_probe) - schedule_work(&hda->probe_work); + schedule_delayed_work(&hda->probe_work, 0); dev++; if (chip->disabled) @@ -2228,6 +2229,11 @@ static int azx_probe_continue(struct azx *chip) int dev = chip->dev_index; int err; + if (chip->disabled || hda->init_failed) + return -EIO; + if (hda->probe_retry) + goto probe_retry; + to_hda_bus(bus)->bus_probing = 1; hda->probe_continued = 1; @@ -2289,10 +2295,20 @@ static int azx_probe_continue(struct azx *chip) #endif } #endif + + probe_retry: if (bus->codec_mask && !(probe_only[dev] & 1)) { err = azx_codec_configure(chip); - if (err < 0) + if (err) { + if ((chip->driver_caps & AZX_DCAPS_RETRY_PROBE) && + ++hda->probe_retry < 60) { + schedule_delayed_work(&hda->probe_work, + msecs_to_jiffies(1000)); + return 0; /* keep things up */ + } + dev_err(chip->card->dev, "Cannot probe codecs, giving up\n"); goto out_free; + } } err = snd_card_register(chip->card); @@ -2322,6 +2338,7 @@ out_free: display_power(chip, false); complete_all(&hda->probe_wait); to_hda_bus(bus)->bus_probing = 0; + hda->probe_retry = 0; return 0; } @@ -2347,7 +2364,7 @@ static void azx_remove(struct pci_dev *pci) * device during cancel_work_sync() call. 
*/ device_unlock(&pci->dev); - cancel_work_sync(&hda->probe_work); + cancel_delayed_work_sync(&hda->probe_work); device_lock(&pci->dev); snd_card_free(card); diff --git a/sound/pci/hda/hda_intel.h b/sound/pci/hda/hda_intel.h index 3fb119f09040..0f39418f9328 100644 --- a/sound/pci/hda/hda_intel.h +++ b/sound/pci/hda/hda_intel.h @@ -14,7 +14,7 @@ struct hda_intel { /* sync probing */ struct completion probe_wait; - struct work_struct probe_work; + struct delayed_work probe_work; /* card list (for power_save trigger) */ struct list_head list; @@ -30,6 +30,8 @@ struct hda_intel { unsigned int freed:1; /* resources already released */ bool need_i915_power:1; /* the hda controller needs i915 power */ + + int probe_retry; /* being probe-retry */ }; #endif diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4407f7da57c4..965b096f416f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -526,6 +526,8 @@ static void alc_shutup_pins(struct hda_codec *codec) struct alc_spec *spec = codec->spec; switch (codec->core.vendor_id) { + case 0x10ec0236: + case 0x10ec0256: case 0x10ec0283: case 0x10ec0286: case 0x10ec0288: @@ -2533,11 +2535,13 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x65d2, "Clevo PB51R[CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65e1, "Clevo PB51[ED][DF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65e5, "Clevo PC50D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x65f1, "Clevo PC50HS", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x70d1, "Clevo PC70[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), - SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170SM", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x7715, "Clevo X170KM-G", ALC1220_FIXUP_CLEVO_PB51ED), SND_PCI_QUIRK(0x1558, 0x9501, "Clevo P950HR", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x9506, "Clevo P955HQ", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x950a, "Clevo P955H[PR]", ALC1220_FIXUP_CLEVO_P950), @@ -3528,7 +3532,8 @@ static void alc256_shutup(struct hda_codec *codec) /* If disable 3k pulldown control for alc257, the Mic detection will not work correctly * when booting with headset plugged. 
So skip setting it for the codec alc257 */ - if (codec->core.vendor_id != 0x10ec0257) + if (spec->codec_variant != ALC269_TYPE_ALC257 && + spec->codec_variant != ALC269_TYPE_ALC256) alc_update_coef_idx(codec, 0x46, 0, 3 << 12); if (!spec->no_shutup_pins) @@ -6401,6 +6406,44 @@ static void alc_fixup_no_int_mic(struct hda_codec *codec, } } +/* GPIO1 = amplifier on/off + * GPIO3 = mic mute LED + */ +static void alc285_fixup_hp_spectre_x360_eb1(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + static const hda_nid_t conn[] = { 0x02 }; + + struct alc_spec *spec = codec->spec; + static const struct hda_pintbl pincfgs[] = { + { 0x14, 0x90170110 }, /* front/high speakers */ + { 0x17, 0x90170130 }, /* back/bass speakers */ + { } + }; + + //enable micmute led + alc_fixup_hp_gpio_led(codec, action, 0x00, 0x04); + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + spec->micmute_led_polarity = 1; + /* needed for amp of back speakers */ + spec->gpio_mask |= 0x01; + spec->gpio_dir |= 0x01; + snd_hda_apply_pincfgs(codec, pincfgs); + /* share DAC to have unified volume control */ + snd_hda_override_conn_list(codec, 0x14, ARRAY_SIZE(conn), conn); + snd_hda_override_conn_list(codec, 0x17, ARRAY_SIZE(conn), conn); + break; + case HDA_FIXUP_ACT_INIT: + /* need to toggle GPIO to enable the amp of back speakers */ + alc_update_gpio_data(codec, 0x01, true); + msleep(100); + alc_update_gpio_data(codec, 0x01, false); + break; + } +} + static void alc285_fixup_hp_spectre_x360(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -6449,6 +6492,24 @@ static void alc287_fixup_legion_15imhg05_speakers(struct hda_codec *codec, /* for alc285_fixup_ideapad_s740_coef() */ #include "ideapad_s740_helper.c" +static void alc256_fixup_tongfang_reset_persistent_settings(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + /* + * A certain other OS sets these coeffs to different values. On at least one TongFang + * barebone these settings might survive even a cold reboot. So to restore a clean slate the + * values are explicitly reset to default here. Without this, the external microphone is + * always in a plugged-in state, while the internal microphone is always in an unplugged + * state, breaking the ability to use the internal microphone. 
+ */ + alc_write_coef_idx(codec, 0x24, 0x0000); + alc_write_coef_idx(codec, 0x26, 0x0000); + alc_write_coef_idx(codec, 0x29, 0x3000); + alc_write_coef_idx(codec, 0x37, 0xfe05); + alc_write_coef_idx(codec, 0x45, 0x5089); +} + enum { ALC269_FIXUP_GPIO2, ALC269_FIXUP_SONY_VAIO, @@ -6535,6 +6596,7 @@ enum { ALC269_FIXUP_HP_DOCK_GPIO_MIC1_LED, ALC280_FIXUP_HP_9480M, ALC245_FIXUP_HP_X360_AMP, + ALC285_FIXUP_HP_SPECTRE_X360_EB1, ALC288_FIXUP_DELL_HEADSET_MODE, ALC288_FIXUP_DELL1_MIC_NO_PRESENCE, ALC288_FIXUP_DELL_XPS_13, @@ -6663,7 +6725,8 @@ enum { ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS, ALC287_FIXUP_LEGION_15IMHG05_AUTOMUTE, ALC287_FIXUP_YOGA7_14ITL_SPEAKERS, - ALC287_FIXUP_13S_GEN2_SPEAKERS + ALC287_FIXUP_13S_GEN2_SPEAKERS, + ALC256_FIXUP_TONGFANG_RESET_PERSISTENT_SETTINGS, }; static const struct hda_fixup alc269_fixups[] = { @@ -8227,6 +8290,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_hp_spectre_x360, }, + [ALC285_FIXUP_HP_SPECTRE_X360_EB1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_hp_spectre_x360_eb1 + }, [ALC287_FIXUP_IDEAPAD_BASS_SPK_AMP] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_ideapad_s740_coef, @@ -8344,7 +8411,7 @@ static const struct hda_fixup alc269_fixups[] = { .v.verbs = (const struct hda_verb[]) { { 0x20, AC_VERB_SET_COEF_INDEX, 0x24 }, { 0x20, AC_VERB_SET_PROC_COEF, 0x41 }, - { 0x20, AC_VERB_SET_PROC_COEF, 0xb020 }, + { 0x20, AC_VERB_SET_COEF_INDEX, 0x26 }, { 0x20, AC_VERB_SET_PROC_COEF, 0x2 }, { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, { 0x20, AC_VERB_SET_PROC_COEF, 0x0 }, @@ -8361,6 +8428,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE, }, + [ALC256_FIXUP_TONGFANG_RESET_PERSISTENT_SETTINGS] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc256_fixup_tongfang_reset_persistent_settings, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8452,6 +8523,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0a30, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a58, "Dell", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a61, "Dell XPS 15 9510", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x0a62, "Dell Precision 5560", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x0a9d, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x0a9e, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), @@ -8554,6 +8628,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x87f7, "HP Spectre x360 14", ALC245_FIXUP_HP_X360_AMP), SND_PCI_QUIRK(0x103c, 0x8805, "HP ProBook 650 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x880d, "HP EliteBook 830 G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8811, "HP Spectre x360 15-eb1xxx", ALC285_FIXUP_HP_SPECTRE_X360_EB1), + SND_PCI_QUIRK(0x103c, 0x8812, "HP Spectre x360 15-eb1xxx", ALC285_FIXUP_HP_SPECTRE_X360_EB1), SND_PCI_QUIRK(0x103c, 0x8846, "HP EliteBook 850 G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8847, "HP EliteBook x360 830 G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x884b, "HP EliteBook 840 Aero G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED), @@ -8789,6 +8865,7 @@ static const 
struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), + SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_TONGFANG_RESET_PERSISTENT_SETTINGS), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), @@ -8974,6 +9051,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC245_FIXUP_HP_X360_AMP, .name = "alc245-hp-x360-amp"}, {.id = ALC295_FIXUP_HP_OMEN, .name = "alc295-hp-omen"}, {.id = ALC285_FIXUP_HP_SPECTRE_X360, .name = "alc285-hp-spectre-x360"}, + {.id = ALC285_FIXUP_HP_SPECTRE_X360_EB1, .name = "alc285-hp-spectre-x360-eb1"}, {.id = ALC287_FIXUP_IDEAPAD_BASS_SPK_AMP, .name = "alc287-ideapad-bass-spk-amp"}, {.id = ALC623_FIXUP_LENOVO_THINKSTATION_P340, .name = "alc623-lenovo-thinkstation-p340"}, {.id = ALC255_FIXUP_ACER_HEADPHONE_AND_MIC, .name = "alc255-acer-headphone-and-mic"}, @@ -10166,6 +10244,9 @@ enum { ALC671_FIXUP_HP_HEADSET_MIC2, ALC662_FIXUP_ACER_X2660G_HEADSET_MODE, ALC662_FIXUP_ACER_NITRO_HEADSET_MODE, + ALC668_FIXUP_ASUS_NO_HEADSET_MIC, + ALC668_FIXUP_HEADSET_MIC, + ALC668_FIXUP_MIC_DET_COEF, }; static const struct hda_fixup alc662_fixups[] = { @@ -10549,6 +10630,29 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC662_FIXUP_USI_FUNC }, + [ALC668_FIXUP_ASUS_NO_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1b, 0x04a1112c }, + { } + }, + .chained = true, + .chain_id = ALC668_FIXUP_HEADSET_MIC + }, + [ALC668_FIXUP_HEADSET_MIC] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_headset_mic, + .chained = true, + .chain_id = ALC668_FIXUP_MIC_DET_COEF + }, + [ALC668_FIXUP_MIC_DET_COEF] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x15 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0d60 }, + {} + }, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -10584,6 +10688,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", ALC668_FIXUP_ASUS_Nx51), + SND_PCI_QUIRK(0x1043, 0x185d, "ASUS G551JW", ALC668_FIXUP_ASUS_NO_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1963, "ASUS X71SL", ALC662_FIXUP_ASUS_MODE8), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 82ee233a269d..216cea04ad70 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -1583,6 +1583,7 @@ config SND_SOC_WCD938X_SDW tristate "WCD9380/WCD9385 Codec - SDW" select SND_SOC_WCD938X select SND_SOC_WCD_MBHC + select REGMAP_IRQ depends on SOUNDWIRE select REGMAP_SOUNDWIRE help diff --git a/sound/soc/codecs/cs42l42.c b/sound/soc/codecs/cs42l42.c index fb1e4c33e27d..9a463ab54bdd 100644 --- a/sound/soc/codecs/cs42l42.c +++ b/sound/soc/codecs/cs42l42.c @@ -922,7 +922,6 @@ static int cs42l42_mute_stream(struct snd_soc_dai *dai, int mute, int stream) struct 
snd_soc_component *component = dai->component; struct cs42l42_private *cs42l42 = snd_soc_component_get_drvdata(component); unsigned int regval; - u8 fullScaleVol; int ret; if (mute) { @@ -993,20 +992,11 @@ static int cs42l42_mute_stream(struct snd_soc_dai *dai, int mute, int stream) cs42l42->stream_use |= 1 << stream; if (stream == SNDRV_PCM_STREAM_PLAYBACK) { - /* Read the headphone load */ - regval = snd_soc_component_read(component, CS42L42_LOAD_DET_RCSTAT); - if (((regval & CS42L42_RLA_STAT_MASK) >> CS42L42_RLA_STAT_SHIFT) == - CS42L42_RLA_STAT_15_OHM) { - fullScaleVol = CS42L42_HP_FULL_SCALE_VOL_MASK; - } else { - fullScaleVol = 0; - } - - /* Un-mute the headphone, set the full scale volume flag */ + /* Un-mute the headphone */ snd_soc_component_update_bits(component, CS42L42_HP_CTL, CS42L42_HP_ANA_AMUTE_MASK | - CS42L42_HP_ANA_BMUTE_MASK | - CS42L42_HP_FULL_SCALE_VOL_MASK, fullScaleVol); + CS42L42_HP_ANA_BMUTE_MASK, + 0); } } diff --git a/sound/soc/codecs/cs4341.c b/sound/soc/codecs/cs4341.c index 7d3e54d8eef3..29d05e32d341 100644 --- a/sound/soc/codecs/cs4341.c +++ b/sound/soc/codecs/cs4341.c @@ -305,12 +305,19 @@ static int cs4341_spi_probe(struct spi_device *spi) return cs4341_probe(&spi->dev); } +static const struct spi_device_id cs4341_spi_ids[] = { + { "cs4341a" }, + { } +}; +MODULE_DEVICE_TABLE(spi, cs4341_spi_ids); + static struct spi_driver cs4341_spi_driver = { .driver = { .name = "cs4341-spi", .of_match_table = of_match_ptr(cs4341_dt_ids), }, .probe = cs4341_spi_probe, + .id_table = cs4341_spi_ids, }; #endif diff --git a/sound/soc/codecs/nau8824.c b/sound/soc/codecs/nau8824.c index db88be48c998..f946ef65a4c1 100644 --- a/sound/soc/codecs/nau8824.c +++ b/sound/soc/codecs/nau8824.c @@ -867,8 +867,8 @@ static void nau8824_jdet_work(struct work_struct *work) struct regmap *regmap = nau8824->regmap; int adc_value, event = 0, event_mask = 0; - snd_soc_dapm_enable_pin(dapm, "MICBIAS"); - snd_soc_dapm_enable_pin(dapm, "SAR"); + snd_soc_dapm_force_enable_pin(dapm, "MICBIAS"); + snd_soc_dapm_force_enable_pin(dapm, "SAR"); snd_soc_dapm_sync(dapm); msleep(100); diff --git a/sound/soc/codecs/pcm179x-spi.c b/sound/soc/codecs/pcm179x-spi.c index 0a542924ec5f..ebf63ea90a1c 100644 --- a/sound/soc/codecs/pcm179x-spi.c +++ b/sound/soc/codecs/pcm179x-spi.c @@ -36,6 +36,7 @@ static const struct of_device_id pcm179x_of_match[] = { MODULE_DEVICE_TABLE(of, pcm179x_of_match); static const struct spi_device_id pcm179x_spi_ids[] = { + { "pcm1792a", 0 }, { "pcm179x", 0 }, { }, }; diff --git a/sound/soc/codecs/pcm512x.c b/sound/soc/codecs/pcm512x.c index 4dc844f3c1fc..60dee41816dc 100644 --- a/sound/soc/codecs/pcm512x.c +++ b/sound/soc/codecs/pcm512x.c @@ -116,6 +116,8 @@ static const struct reg_default pcm512x_reg_defaults[] = { { PCM512x_FS_SPEED_MODE, 0x00 }, { PCM512x_IDAC_1, 0x01 }, { PCM512x_IDAC_2, 0x00 }, + { PCM512x_I2S_1, 0x02 }, + { PCM512x_I2S_2, 0x00 }, }; static bool pcm512x_readable(struct device *dev, unsigned int reg) diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index f0daf8defcf1..52de7d14b139 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -4144,10 +4144,10 @@ static int wcd938x_codec_set_jack(struct snd_soc_component *comp, { struct wcd938x_priv *wcd = dev_get_drvdata(comp->dev); - if (!jack) + if (jack) return wcd_mbhc_start(wcd->wcd_mbhc, &wcd->mbhc_cfg, jack); - - wcd_mbhc_stop(wcd->wcd_mbhc); + else + wcd_mbhc_stop(wcd->wcd_mbhc); return 0; } diff --git a/sound/soc/codecs/wm8960.c b/sound/soc/codecs/wm8960.c index 
9e621a254392..499604f1e178 100644 --- a/sound/soc/codecs/wm8960.c +++ b/sound/soc/codecs/wm8960.c @@ -742,9 +742,16 @@ static int wm8960_configure_clocking(struct snd_soc_component *component) int i, j, k; int ret; - if (!(iface1 & (1<<6))) { - dev_dbg(component->dev, - "Codec is slave mode, no need to configure clock\n"); + /* + * For Slave mode clocking should still be configured, + * so this if statement should be removed, but some platform + * may not work if the sysclk is not configured, to avoid such + * compatible issue, just add '!wm8960->sysclk' condition in + * this if statement. + */ + if (!(iface1 & (1 << 6)) && !wm8960->sysclk) { + dev_warn(component->dev, + "slave mode, but proceeding with no clock configuration\n"); return 0; } diff --git a/sound/soc/fsl/fsl_xcvr.c b/sound/soc/fsl/fsl_xcvr.c index 7ba2fd15132d..d0556c79fdb1 100644 --- a/sound/soc/fsl/fsl_xcvr.c +++ b/sound/soc/fsl/fsl_xcvr.c @@ -487,8 +487,9 @@ static int fsl_xcvr_prepare(struct snd_pcm_substream *substream, return ret; } - /* clear DPATH RESET */ + /* set DPATH RESET */ m_ctl |= FSL_XCVR_EXT_CTRL_DPTH_RESET(tx); + v_ctl |= FSL_XCVR_EXT_CTRL_DPTH_RESET(tx); ret = regmap_update_bits(xcvr->regmap, FSL_XCVR_EXT_CTRL, m_ctl, v_ctl); if (ret < 0) { dev_err(dai->dev, "Error while setting EXT_CTRL: %d\n", ret); @@ -590,10 +591,6 @@ static void fsl_xcvr_shutdown(struct snd_pcm_substream *substream, val |= FSL_XCVR_EXT_CTRL_CMDC_RESET(tx); } - /* set DPATH RESET */ - mask |= FSL_XCVR_EXT_CTRL_DPTH_RESET(tx); - val |= FSL_XCVR_EXT_CTRL_DPTH_RESET(tx); - ret = regmap_update_bits(xcvr->regmap, FSL_XCVR_EXT_CTRL, mask, val); if (ret < 0) { dev_err(dai->dev, "Err setting DPATH RESET: %d\n", ret); @@ -643,6 +640,16 @@ static int fsl_xcvr_trigger(struct snd_pcm_substream *substream, int cmd, dev_err(dai->dev, "Failed to enable DMA: %d\n", ret); return ret; } + + /* clear DPATH RESET */ + ret = regmap_update_bits(xcvr->regmap, FSL_XCVR_EXT_CTRL, + FSL_XCVR_EXT_CTRL_DPTH_RESET(tx), + 0); + if (ret < 0) { + dev_err(dai->dev, "Failed to clear DPATH RESET: %d\n", ret); + return ret; + } + break; case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index 055248f104b2..4d313d0d0f23 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -456,12 +456,12 @@ static const struct dmi_system_id byt_cht_es8316_quirk_table[] = { static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) { + struct device *dev = &pdev->dev; static const char * const mic_name[] = { "in1", "in2" }; + struct snd_soc_acpi_mach *mach = dev_get_platdata(dev); struct property_entry props[MAX_NO_PROPS] = {}; struct byt_cht_es8316_private *priv; const struct dmi_system_id *dmi_id; - struct device *dev = &pdev->dev; - struct snd_soc_acpi_mach *mach; struct fwnode_handle *fwnode; const char *platform_name; struct acpi_device *adev; @@ -476,7 +476,6 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - mach = dev->platform_data; /* fix index of codec dai */ for (i = 0; i < ARRAY_SIZE(byt_cht_es8316_dais); i++) { if (!strcmp(byt_cht_es8316_dais[i].codecs->name, @@ -494,7 +493,7 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) put_device(&adev->dev); byt_cht_es8316_dais[dai_index].codecs->name = codec_name; } else { - dev_err(&pdev->dev, "Error cannot find '%s' dev\n", mach->id); + dev_err(dev, "Error cannot find '%s' dev\n", mach->id); return 
-ENXIO; } @@ -533,11 +532,8 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) /* get the clock */ priv->mclk = devm_clk_get(dev, "pmc_plt_clk_3"); - if (IS_ERR(priv->mclk)) { - ret = PTR_ERR(priv->mclk); - dev_err(dev, "clk_get pmc_plt_clk_3 failed: %d\n", ret); - return ret; - } + if (IS_ERR(priv->mclk)) + return dev_err_probe(dev, PTR_ERR(priv->mclk), "clk_get pmc_plt_clk_3 failed\n"); /* get speaker enable GPIO */ codec_dev = acpi_get_first_physical_node(adev); @@ -567,22 +563,13 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) devm_acpi_dev_add_driver_gpios(codec_dev, byt_cht_es8316_gpios); priv->speaker_en_gpio = - gpiod_get_index(codec_dev, "speaker-enable", 0, - /* see comment in byt_cht_es8316_resume */ - GPIOD_OUT_LOW | GPIOD_FLAGS_BIT_NONEXCLUSIVE); - + gpiod_get_optional(codec_dev, "speaker-enable", + /* see comment in byt_cht_es8316_resume() */ + GPIOD_OUT_LOW | GPIOD_FLAGS_BIT_NONEXCLUSIVE); if (IS_ERR(priv->speaker_en_gpio)) { - ret = PTR_ERR(priv->speaker_en_gpio); - switch (ret) { - case -ENOENT: - priv->speaker_en_gpio = NULL; - break; - default: - dev_err(dev, "get speaker GPIO failed: %d\n", ret); - fallthrough; - case -EPROBE_DEFER: - goto err_put_codec; - } + ret = dev_err_probe(dev, PTR_ERR(priv->speaker_en_gpio), + "get speaker GPIO failed\n"); + goto err_put_codec; } snprintf(components_string, sizeof(components_string), @@ -597,7 +584,7 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) byt_cht_es8316_card.long_name = long_name; #endif - sof_parent = snd_soc_acpi_sof_parent(&pdev->dev); + sof_parent = snd_soc_acpi_sof_parent(dev); /* set card and driver name */ if (sof_parent) { diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index c830e96afba2..80ca260595fd 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2599,6 +2599,7 @@ int snd_soc_component_initialize(struct snd_soc_component *component, INIT_LIST_HEAD(&component->dai_list); INIT_LIST_HEAD(&component->dobj_list); INIT_LIST_HEAD(&component->card_list); + INIT_LIST_HEAD(&component->list); mutex_init(&component->io_mutex); component->name = fmt_single_name(dev, &component->id); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 7b67f1e19ae9..59d07648a7e7 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2561,6 +2561,7 @@ static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, const char *pin, int status) { struct snd_soc_dapm_widget *w = dapm_find_widget(dapm, pin, true); + int ret = 0; dapm_assert_locked(dapm); @@ -2573,13 +2574,14 @@ static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, dapm_mark_dirty(w, "pin configuration"); dapm_widget_invalidate_input_paths(w); dapm_widget_invalidate_output_paths(w); + ret = 1; } w->connected = status; if (status == 0) w->force = 0; - return 0; + return ret; } /** @@ -3583,14 +3585,15 @@ int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); const char *pin = (const char *)kcontrol->private_value; + int ret; if (ucontrol->value.integer.value[0]) - snd_soc_dapm_enable_pin(&card->dapm, pin); + ret = snd_soc_dapm_enable_pin(&card->dapm, pin); else - snd_soc_dapm_disable_pin(&card->dapm, pin); + ret = snd_soc_dapm_disable_pin(&card->dapm, pin); snd_soc_dapm_sync(&card->dapm); - return 0; + return ret; } EXPORT_SYMBOL_GPL(snd_soc_dapm_put_pin_switch); @@ -4023,7 +4026,7 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol, rtd->params_select = 
ucontrol->value.enumerated.item[0]; - return 0; + return 1; } static void diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index a2ce535df14b..8e030b1c061a 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1198,6 +1198,13 @@ static void volume_control_quirks(struct usb_mixer_elem_info *cval, cval->res = 1; } break; + case USB_ID(0x1224, 0x2a25): /* Jieli Technology USB PHY 2.0 */ + if (!strcmp(kctl->id.name, "Mic Capture Volume")) { + usb_audio_info(chip, + "set resolution quirk: cval->res = 16\n"); + cval->res = 16; + } + break; } } diff --git a/sound/usb/mixer_scarlett_gen2.c b/sound/usb/mixer_scarlett_gen2.c index 3d5848d5481b..53ebabf42472 100644 --- a/sound/usb/mixer_scarlett_gen2.c +++ b/sound/usb/mixer_scarlett_gen2.c @@ -2450,6 +2450,8 @@ static int scarlett2_update_monitor_other(struct usb_mixer_interface *mixer) err = scarlett2_usb_get_config(mixer, SCARLETT2_CONFIG_TALKBACK_MAP, 1, &bitmap); + if (err < 0) + return err; for (i = 0; i < num_mixes; i++, bitmap >>= 1) private->talkback_map[i] = bitmap & 1; } diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index e03043f7dad3..2af8c68fac27 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -78,6 +78,48 @@ { USB_DEVICE_VENDOR_SPEC(0x041e, 0x3f19) }, /* + * Creative Technology, Ltd Live! Cam Sync HD [VF0770] + * The device advertises 8 formats, but only a rate of 48kHz is honored by the + * hardware and 24 bits give chopped audio, so only report the one working + * combination. + */ +{ + USB_DEVICE(0x041e, 0x4095), + .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = &(const struct snd_usb_audio_quirk[]) { + { + .ifnum = 2, + .type = QUIRK_AUDIO_STANDARD_MIXER, + }, + { + .ifnum = 3, + .type = QUIRK_AUDIO_FIXED_ENDPOINT, + .data = &(const struct audioformat) { + .formats = SNDRV_PCM_FMTBIT_S16_LE, + .channels = 2, + .fmt_bits = 16, + .iface = 3, + .altsetting = 4, + .altset_idx = 4, + .endpoint = 0x82, + .ep_attr = 0x05, + .rates = SNDRV_PCM_RATE_48000, + .rate_min = 48000, + .rate_max = 48000, + .nr_rates = 1, + .rate_table = (unsigned int[]) { 48000 }, + }, + }, + { + .ifnum = -1 + }, + }, + }, +}, + +/* * HP Wireless Audio * When not ignored, causes instability issues for some users, forcing them to * skip the entire module. 
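As a quick way to confirm the single working combination pinned down by the VF0770 entry above (S16_LE, 2 channels, 48 kHz on endpoint 0x82), userspace can ask alsa-lib which capture rate the PCM will actually accept once the quirk is applied. The following is a minimal sketch only, assuming alsa-lib is installed and that the camera enumerates as "hw:1,0" — the device string is a placeholder, substitute the card shown by arecord -l.

/* rate_probe.c — illustrative userspace probe, build with: gcc rate_probe.c -lasound */
#include <stdio.h>
#include <alsa/asoundlib.h>

int main(void)
{
	snd_pcm_t *pcm;
	snd_pcm_hw_params_t *hw;
	unsigned int rate = 44100;	/* deliberately request a rate the quirk does not expose */
	int dir = 0;

	/* Placeholder device name; pick the VF0770 card from 'arecord -l'. */
	if (snd_pcm_open(&pcm, "hw:1,0", SND_PCM_STREAM_CAPTURE, 0) < 0) {
		fprintf(stderr, "cannot open capture device\n");
		return 1;
	}

	snd_pcm_hw_params_alloca(&hw);
	snd_pcm_hw_params_any(pcm, hw);
	snd_pcm_hw_params_set_format(pcm, hw, SND_PCM_FORMAT_S16_LE);

	/* With the fixed-endpoint quirk in place this is expected to settle on 48000. */
	snd_pcm_hw_params_set_rate_near(pcm, hw, &rate, &dir);
	printf("nearest supported capture rate: %u Hz\n", rate);

	snd_pcm_close(pcm);
	return 0;
}

Without the quirk, the same probe may negotiate one of the advertised-but-broken rates, which is the behaviour the fixed-endpoint entry is meant to hide.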
@@ -3970,6 +4012,38 @@ YAMAHA_DEVICE(0x7010, "UB99"), } } }, +{ + /* + * Sennheiser GSP670 + * Change order of interfaces loaded + */ + USB_DEVICE(0x1395, 0x0300), + .bInterfaceClass = USB_CLASS_PER_INTERFACE, + .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = &(const struct snd_usb_audio_quirk[]) { + // Communication + { + .ifnum = 3, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + // Recording + { + .ifnum = 4, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + // Main + { + .ifnum = 1, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = -1 + } + } + } +}, #undef USB_DEVICE_VENDOR_SPEC #undef USB_AUDIO_DEVICE diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 6ee6d24c847f..8929d9abe8aa 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1719,6 +1719,11 @@ void snd_usb_audioformat_attributes_quirk(struct snd_usb_audio *chip, */ fp->attributes &= ~UAC_EP_CS_ATTR_FILL_MAX; break; + case USB_ID(0x1224, 0x2a25): /* Jieli Technology USB PHY 2.0 */ + /* mic works only when ep packet size is set to wMaxPacketSize */ + fp->attributes |= UAC_EP_CS_ATTR_FILL_MAX; + break; + } } @@ -1884,10 +1889,14 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x2912, 0x30c8, /* Audioengine D1 */ QUIRK_FLAG_GET_SAMPLE_RATE), + DEVICE_FLG(0x30be, 0x0101, /* Schiit Hel */ + QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x413c, 0xa506, /* Dell AE515 sound bar */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x534d, 0x2109, /* MacroSilicon MS2109 */ QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x1224, 0x2a25, /* Jieli Technology USB PHY 2.0 */ + QUIRK_FLAG_GET_SAMPLE_RATE), /* Vendor matches */ VENDOR_FLG(0x045e, /* MS Lifecam */ @@ -1900,6 +1909,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), VENDOR_FLG(0x07fd, /* MOTU */ QUIRK_FLAG_VALIDATE_RATES), + VENDOR_FLG(0x1235, /* Focusrite Novation */ + QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x152a, /* Thesycon devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x1de7, /* Phoenix Audio */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index f92880a15645..2fc09579e24a 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1210,14 +1210,16 @@ union perf_mem_data_src { mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ mem_blk:3, /* access blocked */ - mem_rsvd:21; + mem_hops:3, /* hop level */ + mem_rsvd:18; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:21, + __u64 mem_rsvd:18, + mem_hops:3, /* hop level */ mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ @@ -1241,7 +1243,13 @@ union perf_mem_data_src { #define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ #define PERF_MEM_OP_SHIFT 0 -/* memory hierarchy (memory level, hit or miss) */ +/* + * PERF_MEM_LVL_* namespace being deprecated to some extent in + * favour of the newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. + * Supporting this namespace in order to not break defined ABIs.
+ * + * memory hierarchy (memory level, hit or miss) + */ #define PERF_MEM_LVL_NA 0x01 /* not available */ #define PERF_MEM_LVL_HIT 0x02 /* hit level */ #define PERF_MEM_LVL_MISS 0x04 /* miss level */ @@ -1307,6 +1315,11 @@ union perf_mem_data_src { #define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ #define PERF_MEM_BLK_SHIFT 40 +/* hop level */ +#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ +/* 2-7 available */ +#define PERF_MEM_HOPS_SHIFT 43 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index 1d84ec9db93b..5859ca0a1439 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -784,6 +784,7 @@ struct snd_rawmidi_status { #define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) #define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) +#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) #define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) #define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) #define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index b0bf56c5f120..5a5bd74f55bd 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -742,7 +742,7 @@ class DebugfsProvider(Provider): The fields are all available KVM debugfs files """ - exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns'] + exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns', 'halt_wait_ns'] fields = [field for field in self.walkdir(PATH_DEBUGFS_KVM)[2] if field not in exempt_list] diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 88d8825fc6f6..e4f83c304ec9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6894,7 +6894,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) if (obj->gen_loader) { /* reset FDs */ - btf__set_fd(obj->btf, -1); + if (obj->btf) + btf__set_fd(obj->btf, -1); for (i = 0; i < obj->nr_maps; i++) obj->maps[i].fd = -1; if (!err) diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c index 1fb8b49de1d6..ea655318153f 100644 --- a/tools/lib/bpf/strset.c +++ b/tools/lib/bpf/strset.c @@ -88,6 +88,7 @@ void strset__free(struct strset *set) hashmap__free(set->strs_hash); free(set->strs_data); + free(set); } size_t strset__data_size(const struct strset *set) diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index c67c83399170..ce91a582f0e4 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -40,7 +40,7 @@ static int test_stat_cpu(void) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK, }; - int err, cpu, tmp; + int err, idx; cpus = perf_cpu_map__new(NULL); __T("failed to create cpus", cpus); @@ -70,10 +70,10 @@ static int test_stat_cpu(void) perf_evlist__for_each_evsel(evlist, evsel) { cpus = perf_evsel__cpus(evsel); - perf_cpu_map__for_each_cpu(cpu, tmp, cpus) { + for (idx = 0; idx < perf_cpu_map__nr(cpus); idx++) { struct perf_counts_values counts = { .val = 0 }; - perf_evsel__read(evsel, cpu, 0, &counts); + perf_evsel__read(evsel, idx, 0, &counts); __T("failed to read value for evsel", counts.val != 0); } } diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index a184e4861627..33ae9334861a 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -22,7 +22,7 @@ 
static int test_stat_cpu(void) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, }; - int err, cpu, tmp; + int err, idx; cpus = perf_cpu_map__new(NULL); __T("failed to create cpus", cpus); @@ -33,10 +33,10 @@ static int test_stat_cpu(void) err = perf_evsel__open(evsel, cpus, NULL); __T("failed to open evsel", err == 0); - perf_cpu_map__for_each_cpu(cpu, tmp, cpus) { + for (idx = 0; idx < perf_cpu_map__nr(cpus); idx++) { struct perf_counts_values counts = { .val = 0 }; - perf_evsel__read(evsel, cpu, 0, &counts); + perf_evsel__read(evsel, idx, 0, &counts); __T("failed to read value for evsel", counts.val != 0); } @@ -148,6 +148,7 @@ static int test_stat_user_read(int event) __T("failed to mmap evsel", err == 0); pc = perf_evsel__mmap_base(evsel, 0, 0); + __T("failed to get mmapped address", pc); #if defined(__i386__) || defined(__x86_64__) __T("userspace counter access not supported", pc->cap_user_rdpmc); diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index bc821056aba9..4d6d7fc13255 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -20,6 +20,7 @@ #include <objtool/arch.h> #include <objtool/warn.h> #include <objtool/endianness.h> +#include <objtool/builtin.h> #include <arch/elf.h> static int is_x86_64(const struct elf *elf) @@ -102,12 +103,13 @@ unsigned long arch_jump_destination(struct instruction *insn) #define rm_is_mem(reg) (mod_is_mem() && !is_RIP() && rm_is(reg)) #define rm_is_reg(reg) (mod_is_reg() && modrm_rm == (reg)) -int arch_decode_instruction(const struct elf *elf, const struct section *sec, +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, unsigned long *immediate, struct list_head *ops_list) { + const struct elf *elf = file->elf; struct insn insn; int x86_64, ret; unsigned char op1, op2, @@ -544,6 +546,36 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, *type = INSN_RETURN; break; + case 0xc7: /* mov imm, r/m */ + if (!noinstr) + break; + + if (insn.length == 3+4+4 && !strncmp(sec->name, ".init.text", 10)) { + struct reloc *immr, *disp; + struct symbol *func; + int idx; + + immr = find_reloc_by_dest(elf, (void *)sec, offset+3); + disp = find_reloc_by_dest(elf, (void *)sec, offset+7); + + if (!immr || strcmp(immr->sym->name, "pv_ops")) + break; + + idx = (immr->addend + 8) / sizeof(void *); + + func = disp->sym; + if (disp->sym->type == STT_SECTION) + func = find_symbol_by_offset(disp->sym->sec, disp->addend); + if (!func) { + WARN("no func for pv_ops[]"); + return -1; + } + + objtool_pv_add(file, idx, func); + } + + break; + case 0xcf: /* iret */ /* * Handle sync_core(), which has an IRET to self. @@ -659,154 +691,52 @@ const char *arch_nop_insn(int len) return nops[len-1]; } -/* asm/alternative.h ? 
*/ - -#define ALTINSTR_FLAG_INV (1 << 15) -#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) - -struct alt_instr { - s32 instr_offset; /* original instruction */ - s32 repl_offset; /* offset to replacement instruction */ - u16 cpuid; /* cpuid bit set for replacement */ - u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction */ -} __packed; - -static int elf_add_alternative(struct elf *elf, - struct instruction *orig, struct symbol *sym, - int cpuid, u8 orig_len, u8 repl_len) -{ - const int size = sizeof(struct alt_instr); - struct alt_instr *alt; - struct section *sec; - Elf_Scn *s; - - sec = find_section_by_name(elf, ".altinstructions"); - if (!sec) { - sec = elf_create_section(elf, ".altinstructions", - SHF_ALLOC, size, 0); - - if (!sec) { - WARN_ELF("elf_create_section"); - return -1; - } - } - - s = elf_getscn(elf->elf, sec->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return -1; - } - - sec->data = elf_newdata(s); - if (!sec->data) { - WARN_ELF("elf_newdata"); - return -1; - } - - sec->data->d_size = size; - sec->data->d_align = 1; - - alt = sec->data->d_buf = malloc(size); - if (!sec->data->d_buf) { - perror("malloc"); - return -1; - } - memset(sec->data->d_buf, 0, size); - - if (elf_add_reloc_to_insn(elf, sec, sec->sh.sh_size, - R_X86_64_PC32, orig->sec, orig->offset)) { - WARN("elf_create_reloc: alt_instr::instr_offset"); - return -1; - } - - if (elf_add_reloc(elf, sec, sec->sh.sh_size + 4, - R_X86_64_PC32, sym, 0)) { - WARN("elf_create_reloc: alt_instr::repl_offset"); - return -1; - } - - alt->cpuid = bswap_if_needed(cpuid); - alt->instrlen = orig_len; - alt->replacementlen = repl_len; - - sec->sh.sh_size += size; - sec->changed = true; - - return 0; -} - -#define X86_FEATURE_RETPOLINE ( 7*32+12) +#define BYTE_RET 0xC3 -int arch_rewrite_retpolines(struct objtool_file *file) +const char *arch_ret_insn(int len) { - struct instruction *insn; - struct reloc *reloc; - struct symbol *sym; - char name[32] = ""; - - list_for_each_entry(insn, &file->retpoline_call_list, call_node) { - - if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC) - continue; - - if (!strcmp(insn->sec->name, ".text.__x86.indirect_thunk")) - continue; - - reloc = insn->reloc; - - sprintf(name, "__x86_indirect_alt_%s_%s", - insn->type == INSN_JUMP_DYNAMIC ? 
"jmp" : "call", - reloc->sym->name + 21); - - sym = find_symbol_by_name(file->elf, name); - if (!sym) { - sym = elf_create_undef_symbol(file->elf, name); - if (!sym) { - WARN("elf_create_undef_symbol"); - return -1; - } - } + static const char ret[5][5] = { + { BYTE_RET }, + { BYTE_RET, BYTES_NOP1 }, + { BYTE_RET, BYTES_NOP2 }, + { BYTE_RET, BYTES_NOP3 }, + { BYTE_RET, BYTES_NOP4 }, + }; - if (elf_add_alternative(file->elf, insn, sym, - ALT_NOT(X86_FEATURE_RETPOLINE), 5, 5)) { - WARN("elf_add_alternative"); - return -1; - } + if (len < 1 || len > 5) { + WARN("invalid RET size: %d\n", len); + return NULL; } - return 0; + return ret[len-1]; } -int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) +int arch_decode_hint_reg(u8 sp_reg, int *base) { - struct cfi_reg *cfa = &insn->cfi.cfa; - switch (sp_reg) { case ORC_REG_UNDEFINED: - cfa->base = CFI_UNDEFINED; + *base = CFI_UNDEFINED; break; case ORC_REG_SP: - cfa->base = CFI_SP; + *base = CFI_SP; break; case ORC_REG_BP: - cfa->base = CFI_BP; + *base = CFI_BP; break; case ORC_REG_SP_INDIRECT: - cfa->base = CFI_SP_INDIRECT; + *base = CFI_SP_INDIRECT; break; case ORC_REG_R10: - cfa->base = CFI_R10; + *base = CFI_R10; break; case ORC_REG_R13: - cfa->base = CFI_R13; + *base = CFI_R13; break; case ORC_REG_DI: - cfa->base = CFI_DI; + *base = CFI_DI; break; case ORC_REG_DX: - cfa->base = CFI_DX; + *base = CFI_DX; break; default: return -1; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e5947fbb9e7a..fb3f251ea021 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -5,6 +5,7 @@ #include <string.h> #include <stdlib.h> +#include <sys/mman.h> #include <arch/elf.h> #include <objtool/builtin.h> @@ -26,7 +27,11 @@ struct alternative { bool skip_orig; }; -struct cfi_init_state initial_func_cfi; +static unsigned long nr_cfi, nr_cfi_reused, nr_cfi_cache; + +static struct cfi_init_state initial_func_cfi; +static struct cfi_state init_cfi; +static struct cfi_state func_cfi; struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset) @@ -173,6 +178,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "rewind_stack_do_exit", "kunit_try_catch_throw", "xen_start_kernel", + "cpu_bringup_and_idle", }; if (!func) @@ -265,6 +271,78 @@ static void init_insn_state(struct insn_state *state, struct section *sec) state->noinstr = sec->noinstr; } +static struct cfi_state *cfi_alloc(void) +{ + struct cfi_state *cfi = calloc(sizeof(struct cfi_state), 1); + if (!cfi) { + WARN("calloc failed"); + exit(1); + } + nr_cfi++; + return cfi; +} + +static int cfi_bits; +static struct hlist_head *cfi_hash; + +static inline bool cficmp(struct cfi_state *cfi1, struct cfi_state *cfi2) +{ + return memcmp((void *)cfi1 + sizeof(cfi1->hash), + (void *)cfi2 + sizeof(cfi2->hash), + sizeof(struct cfi_state) - sizeof(struct hlist_node)); +} + +static inline u32 cfi_key(struct cfi_state *cfi) +{ + return jhash((void *)cfi + sizeof(cfi->hash), + sizeof(*cfi) - sizeof(cfi->hash), 0); +} + +static struct cfi_state *cfi_hash_find_or_add(struct cfi_state *cfi) +{ + struct hlist_head *head = &cfi_hash[hash_min(cfi_key(cfi), cfi_bits)]; + struct cfi_state *obj; + + hlist_for_each_entry(obj, head, hash) { + if (!cficmp(cfi, obj)) { + nr_cfi_cache++; + return obj; + } + } + + obj = cfi_alloc(); + *obj = *cfi; + hlist_add_head(&obj->hash, head); + + return obj; +} + +static void cfi_hash_add(struct cfi_state *cfi) +{ + struct hlist_head *head = &cfi_hash[hash_min(cfi_key(cfi), cfi_bits)]; + + 
hlist_add_head(&cfi->hash, head); +} + +static void *cfi_hash_alloc(unsigned long size) +{ + cfi_bits = max(10, ilog2(size)); + cfi_hash = mmap(NULL, sizeof(struct hlist_head) << cfi_bits, + PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (cfi_hash == (void *)-1L) { + WARN("mmap fail cfi_hash"); + cfi_hash = NULL; + } else if (stats) { + printf("cfi_bits: %d\n", cfi_bits); + } + + return cfi_hash; +} + +static unsigned long nr_insns; +static unsigned long nr_insns_visited; + /* * Call the arch-specific instruction decoder for all the instructions and add * them to the global instruction list. @@ -275,7 +353,6 @@ static int decode_instructions(struct objtool_file *file) struct symbol *func; unsigned long offset; struct instruction *insn; - unsigned long nr_insns = 0; int ret; for_each_sec(file, sec) { @@ -292,7 +369,7 @@ static int decode_instructions(struct objtool_file *file) !strcmp(sec->name, ".entry.text")) sec->noinstr = true; - for (offset = 0; offset < sec->len; offset += insn->len) { + for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -301,13 +378,12 @@ static int decode_instructions(struct objtool_file *file) memset(insn, 0, sizeof(*insn)); INIT_LIST_HEAD(&insn->alts); INIT_LIST_HEAD(&insn->stack_ops); - init_cfi_state(&insn->cfi); insn->sec = sec; insn->offset = offset; - ret = arch_decode_instruction(file->elf, sec, offset, - sec->len - offset, + ret = arch_decode_instruction(file, sec, offset, + sec->sh.sh_size - offset, &insn->len, &insn->type, &insn->immediate, &insn->stack_ops); @@ -344,14 +420,90 @@ err: return ret; } +/* + * Read the pv_ops[] .data table to find the static initialized values. + */ +static int add_pv_ops(struct objtool_file *file, const char *symname) +{ + struct symbol *sym, *func; + unsigned long off, end; + struct reloc *rel; + int idx; + + sym = find_symbol_by_name(file->elf, symname); + if (!sym) + return 0; + + off = sym->offset; + end = off + sym->len; + for (;;) { + rel = find_reloc_by_dest_range(file->elf, sym->sec, off, end - off); + if (!rel) + break; + + func = rel->sym; + if (func->type == STT_SECTION) + func = find_symbol_by_offset(rel->sym->sec, rel->addend); + + idx = (rel->offset - sym->offset) / sizeof(unsigned long); + + objtool_pv_add(file, idx, func); + + off = rel->offset + 1; + if (off > end) + break; + } + + return 0; +} + +/* + * Allocate and initialize file->pv_ops[]. + */ +static int init_pv_ops(struct objtool_file *file) +{ + static const char *pv_ops_tables[] = { + "pv_ops", + "xen_cpu_ops", + "xen_irq_ops", + "xen_mmu_ops", + NULL, + }; + const char *pv_ops; + struct symbol *sym; + int idx, nr; + + if (!noinstr) + return 0; + + file->pv_ops = NULL; + + sym = find_symbol_by_name(file->elf, "pv_ops"); + if (!sym) + return 0; + + nr = sym->len / sizeof(unsigned long); + file->pv_ops = calloc(sizeof(struct pv_state), nr); + if (!file->pv_ops) + return -1; + + for (idx = 0; idx < nr; idx++) + INIT_LIST_HEAD(&file->pv_ops[idx].targets); + + for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) + add_pv_ops(file, pv_ops); + + return 0; +} + static struct instruction *find_last_insn(struct objtool_file *file, struct section *sec) { struct instruction *insn = NULL; unsigned int offset; - unsigned int end = (sec->len > 10) ? sec->len - 10 : 0; + unsigned int end = (sec->sh.sh_size > 10) ? 
sec->sh.sh_size - 10 : 0; - for (offset = sec->len - 1; offset >= end && !insn; offset--) + for (offset = sec->sh.sh_size - 1; offset >= end && !insn; offset--) insn = find_insn(file, sec, offset); return insn; @@ -389,7 +541,7 @@ static int add_dead_ends(struct objtool_file *file) insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (reloc->addend == reloc->sym->sec->len) { + else if (reloc->addend == reloc->sym->sec->sh.sh_size) { insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find unreachable insn at %s+0x%x", @@ -424,7 +576,7 @@ reachable: insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (reloc->addend == reloc->sym->sec->len) { + else if (reloc->addend == reloc->sym->sec->sh.sh_size) { insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find reachable insn at %s+0x%x", @@ -531,6 +683,52 @@ static int create_static_call_sections(struct objtool_file *file) return 0; } +static int create_retpoline_sites_sections(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + int idx; + + sec = find_section_by_name(file->elf, ".retpoline_sites"); + if (sec) { + WARN("file already has .retpoline_sites, skipping"); + return 0; + } + + idx = 0; + list_for_each_entry(insn, &file->retpoline_call_list, call_node) + idx++; + + if (!idx) + return 0; + + sec = elf_create_section(file->elf, ".retpoline_sites", 0, + sizeof(int), idx); + if (!sec) { + WARN("elf_create_section: .retpoline_sites"); + return -1; + } + + idx = 0; + list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + + int *site = (int *)sec->data->d_buf + idx; + *site = 0; + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(int), + R_X86_64_PC32, + insn->sec, insn->offset)) { + WARN("elf_add_reloc_to_insn: .retpoline_sites"); + return -1; + } + + idx++; + } + + return 0; +} + static int create_mcount_loc_sections(struct objtool_file *file) { struct section *sec; @@ -549,7 +747,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) return 0; idx = 0; - list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) + list_for_each_entry(insn, &file->mcount_loc_list, call_node) idx++; sec = elf_create_section(file->elf, "__mcount_loc", 0, sizeof(unsigned long), idx); @@ -557,7 +755,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) return -1; idx = 0; - list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { + list_for_each_entry(insn, &file->mcount_loc_list, call_node) { loc = (unsigned long *)sec->data->d_buf + idx; memset(loc, 0, sizeof(unsigned long)); @@ -817,6 +1015,9 @@ static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *i return NULL; if (!insn->reloc) { + if (!file) + return NULL; + insn->reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); if (!insn->reloc) { @@ -828,6 +1029,136 @@ static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *i return insn->reloc; } +static void remove_insn_ops(struct instruction *insn) +{ + struct stack_op *op, *tmp; + + list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) { + list_del(&op->list); + free(op); + } +} + +static void annotate_call_site(struct objtool_file *file, + struct instruction *insn, bool sibling) +{ + struct reloc *reloc = insn_reloc(file, insn); + struct symbol *sym = insn->call_dest; + + if (!sym) + sym = reloc->sym; + + /* + * Alternative 
replacement code is just template code which is + * sometimes copied to the original instruction. For now, don't + * annotate it. (In the future we might consider annotating the + * original instruction if/when it ever makes sense to do so.) + */ + if (!strcmp(insn->sec->name, ".altinstr_replacement")) + return; + + if (sym->static_call_tramp) { + list_add_tail(&insn->call_node, &file->static_call_list); + return; + } + + if (sym->retpoline_thunk) { + list_add_tail(&insn->call_node, &file->retpoline_call_list); + return; + } + + /* + * Many compilers cannot disable KCOV with a function attribute + * so they need a little help, NOP out any KCOV calls from noinstr + * text. + */ + if (insn->sec->noinstr && sym->kcov) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + sibling ? arch_ret_insn(insn->len) + : arch_nop_insn(insn->len)); + + insn->type = sibling ? INSN_RETURN : INSN_NOP; + return; + } + + if (mcount && sym->fentry) { + if (sibling) + WARN_FUNC("Tail call to __fentry__ !?!?", insn->sec, insn->offset); + + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + + insn->type = INSN_NOP; + + list_add_tail(&insn->call_node, &file->mcount_loc_list); + return; + } +} + +static void add_call_dest(struct objtool_file *file, struct instruction *insn, + struct symbol *dest, bool sibling) +{ + insn->call_dest = dest; + if (!dest) + return; + + /* + * Whatever stack impact regular CALLs have, should be undone + * by the RETURN of the called function. + * + * Annotated intra-function calls retain the stack_ops but + * are converted to JUMP, see read_intra_function_calls(). + */ + remove_insn_ops(insn); + + annotate_call_site(file, insn, sibling); +} + +static void add_retpoline_call(struct objtool_file *file, struct instruction *insn) +{ + /* + * Retpoline calls/jumps are really dynamic calls/jumps in disguise, + * so convert them accordingly. + */ + switch (insn->type) { + case INSN_CALL: + insn->type = INSN_CALL_DYNAMIC; + break; + case INSN_JUMP_UNCONDITIONAL: + insn->type = INSN_JUMP_DYNAMIC; + break; + case INSN_JUMP_CONDITIONAL: + insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; + break; + default: + return; + } + + insn->retpoline_safe = true; + + /* + * Whatever stack impact regular CALLs have, should be undone + * by the RETURN of the called function. + * + * Annotated intra-function calls retain the stack_ops but + * are converted to JUMP, see read_intra_function_calls(). + */ + remove_insn_ops(insn); + + annotate_call_site(file, insn, false); +} /* * Find the destination instructions for all jumps. */ @@ -849,28 +1180,12 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); - } else if (arch_is_retpoline(reloc->sym)) { - /* - * Retpoline jumps are really dynamic jumps in - * disguise, so convert them accordingly. 
- */ - if (insn->type == INSN_JUMP_UNCONDITIONAL) - insn->type = INSN_JUMP_DYNAMIC; - else - insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; - - list_add_tail(&insn->call_node, - &file->retpoline_call_list); - - insn->retpoline_safe = true; + } else if (reloc->sym->retpoline_thunk) { + add_retpoline_call(file, insn); continue; } else if (insn->func) { /* internal or external sibling call (with reloc) */ - insn->call_dest = reloc->sym; - if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); - } + add_call_dest(file, insn, reloc->sym, true); continue; } else if (reloc->sym->sec->idx) { dest_sec = reloc->sym->sec; @@ -926,13 +1241,8 @@ static int add_jump_destinations(struct objtool_file *file) } else if (insn->jump_dest->func->pfunc != insn->func->pfunc && insn->jump_dest->offset == insn->jump_dest->func->offset) { - /* internal sibling call (without reloc) */ - insn->call_dest = insn->jump_dest->func; - if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); - } + add_call_dest(file, insn, insn->jump_dest->func, true); } } } @@ -940,16 +1250,6 @@ static int add_jump_destinations(struct objtool_file *file) return 0; } -static void remove_insn_ops(struct instruction *insn) -{ - struct stack_op *op, *tmp; - - list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) { - list_del(&op->list); - free(op); - } -} - static struct symbol *find_call_destination(struct section *sec, unsigned long offset) { struct symbol *call_dest; @@ -968,6 +1268,7 @@ static int add_call_destinations(struct objtool_file *file) { struct instruction *insn; unsigned long dest_off; + struct symbol *dest; struct reloc *reloc; for_each_insn(file, insn) { @@ -977,7 +1278,9 @@ static int add_call_destinations(struct objtool_file *file) reloc = insn_reloc(file, insn); if (!reloc) { dest_off = arch_jump_destination(insn); - insn->call_dest = find_call_destination(insn->sec, dest_off); + dest = find_call_destination(insn->sec, dest_off); + + add_call_dest(file, insn, dest, false); if (insn->ignore) continue; @@ -995,9 +1298,8 @@ static int add_call_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_off = arch_dest_reloc_offset(reloc->addend); - insn->call_dest = find_call_destination(reloc->sym->sec, - dest_off); - if (!insn->call_dest) { + dest = find_call_destination(reloc->sym->sec, dest_off); + if (!dest) { WARN_FUNC("can't find call dest symbol at %s+0x%lx", insn->sec, insn->offset, reloc->sym->sec->name, @@ -1005,70 +1307,13 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - } else if (arch_is_retpoline(reloc->sym)) { - /* - * Retpoline calls are really dynamic calls in - * disguise, so convert them accordingly. - */ - insn->type = INSN_CALL_DYNAMIC; - insn->retpoline_safe = true; + add_call_dest(file, insn, dest, false); - list_add_tail(&insn->call_node, - &file->retpoline_call_list); - - remove_insn_ops(insn); - continue; + } else if (reloc->sym->retpoline_thunk) { + add_retpoline_call(file, insn); } else - insn->call_dest = reloc->sym; - - if (insn->call_dest && insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); - } - - /* - * Many compilers cannot disable KCOV with a function attribute - * so they need a little help, NOP out any KCOV calls from noinstr - * text. 
- */ - if (insn->sec->noinstr && - !strncmp(insn->call_dest->name, "__sanitizer_cov_", 16)) { - if (reloc) { - reloc->type = R_NONE; - elf_write_reloc(file->elf, reloc); - } - - elf_write_insn(file->elf, insn->sec, - insn->offset, insn->len, - arch_nop_insn(insn->len)); - insn->type = INSN_NOP; - } - - if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { - if (reloc) { - reloc->type = R_NONE; - elf_write_reloc(file->elf, reloc); - } - - elf_write_insn(file->elf, insn->sec, - insn->offset, insn->len, - arch_nop_insn(insn->len)); - - insn->type = INSN_NOP; - - list_add_tail(&insn->mcount_loc_node, - &file->mcount_loc_list); - } - - /* - * Whatever stack impact regular CALLs have, should be undone - * by the RETURN of the called function. - * - * Annotated intra-function calls retain the stack_ops but - * are converted to JUMP, see read_intra_function_calls(). - */ - remove_insn_ops(insn); + add_call_dest(file, insn, reloc->sym, false); } return 0; @@ -1136,7 +1381,6 @@ static int handle_group_alt(struct objtool_file *file, memset(nop, 0, sizeof(*nop)); INIT_LIST_HEAD(&nop->alts); INIT_LIST_HEAD(&nop->stack_ops); - init_cfi_state(&nop->cfi); nop->sec = special_alt->new_sec; nop->offset = special_alt->new_off + special_alt->new_len; @@ -1545,10 +1789,11 @@ static void set_func_state(struct cfi_state *state) static int read_unwind_hints(struct objtool_file *file) { + struct cfi_state cfi = init_cfi; struct section *sec, *relocsec; - struct reloc *reloc; struct unwind_hint *hint; struct instruction *insn; + struct reloc *reloc; int i; sec = find_section_by_name(file->elf, ".discard.unwind_hints"); @@ -1561,14 +1806,14 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - if (sec->len % sizeof(struct unwind_hint)) { + if (sec->sh.sh_size % sizeof(struct unwind_hint)) { WARN("struct unwind_hint size mismatch"); return -1; } file->hints = true; - for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { + for (i = 0; i < sec->sh.sh_size / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint)); @@ -1586,19 +1831,24 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; if (hint->type == UNWIND_HINT_TYPE_FUNC) { - set_func_state(&insn->cfi); + insn->cfi = &func_cfi; continue; } - if (arch_decode_hint_reg(insn, hint->sp_reg)) { + if (insn->cfi) + cfi = *(insn->cfi); + + if (arch_decode_hint_reg(hint->sp_reg, &cfi.cfa.base)) { WARN_FUNC("unsupported unwind_hint sp base reg %d", insn->sec, insn->offset, hint->sp_reg); return -1; } - insn->cfi.cfa.offset = bswap_if_needed(hint->sp_offset); - insn->cfi.type = hint->type; - insn->cfi.end = hint->end; + cfi.cfa.offset = bswap_if_needed(hint->sp_offset); + cfi.type = hint->type; + cfi.end = hint->end; + + insn->cfi = cfi_hash_find_or_add(&cfi); } return 0; @@ -1737,17 +1987,28 @@ static int read_intra_function_calls(struct objtool_file *file) return 0; } -static int read_static_call_tramps(struct objtool_file *file) +static int classify_symbols(struct objtool_file *file) { struct section *sec; struct symbol *func; for_each_sec(file, sec) { list_for_each_entry(func, &sec->symbol_list, list) { - if (func->bind == STB_GLOBAL && - !strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR, + if (func->bind != STB_GLOBAL) + continue; + + if (!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR, strlen(STATIC_CALL_TRAMP_PREFIX_STR))) func->static_call_tramp = true; + + if (arch_is_retpoline(func)) + func->retpoline_thunk = true; 
+ + if (!strcmp(func->name, "__fentry__")) + func->fentry = true; + + if (!strncmp(func->name, "__sanitizer_cov_", 16)) + func->kcov = true; } } @@ -1780,17 +2041,16 @@ static void mark_rodata(struct objtool_file *file) file->rodata = found; } -__weak int arch_rewrite_retpolines(struct objtool_file *file) -{ - return 0; -} - static int decode_sections(struct objtool_file *file) { int ret; mark_rodata(file); + ret = init_pv_ops(file); + if (ret) + return ret; + ret = decode_instructions(file); if (ret) return ret; @@ -1809,7 +2069,7 @@ static int decode_sections(struct objtool_file *file) /* * Must be before add_{jump_call}_destination. */ - ret = read_static_call_tramps(file); + ret = classify_symbols(file); if (ret) return ret; @@ -1853,23 +2113,14 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - /* - * Must be after add_special_section_alts(), since this will emit - * alternatives. Must be after add_{jump,call}_destination(), since - * those create the call insn lists. - */ - ret = arch_rewrite_retpolines(file); - if (ret) - return ret; - return 0; } static bool is_fentry_call(struct instruction *insn) { - if (insn->type == INSN_CALL && insn->call_dest && - insn->call_dest->type == STT_NOTYPE && - !strcmp(insn->call_dest->name, "__fentry__")) + if (insn->type == INSN_CALL && + insn->call_dest && + insn->call_dest->fentry) return true; return false; @@ -2452,13 +2703,18 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn if (!insn->alt_group) return 0; + if (!insn->cfi) { + WARN("CFI missing"); + return -1; + } + alt_cfi = insn->alt_group->cfi; group_off = insn->offset - insn->alt_group->first_insn->offset; if (!alt_cfi[group_off]) { - alt_cfi[group_off] = &insn->cfi; + alt_cfi[group_off] = insn->cfi; } else { - if (memcmp(alt_cfi[group_off], &insn->cfi, sizeof(struct cfi_state))) { + if (cficmp(alt_cfi[group_off], insn->cfi)) { WARN_FUNC("stack layout conflict in alternatives", insn->sec, insn->offset); return -1; @@ -2509,9 +2765,14 @@ static int handle_insn_ops(struct instruction *insn, static bool insn_cfi_match(struct instruction *insn, struct cfi_state *cfi2) { - struct cfi_state *cfi1 = &insn->cfi; + struct cfi_state *cfi1 = insn->cfi; int i; + if (!cfi1) { + WARN("CFI missing"); + return false; + } + if (memcmp(&cfi1->cfa, &cfi2->cfa, sizeof(cfi1->cfa))) { WARN_FUNC("stack state mismatch: cfa1=%d%+d cfa2=%d%+d", @@ -2562,20 +2823,64 @@ static inline bool func_uaccess_safe(struct symbol *func) static inline const char *call_dest_name(struct instruction *insn) { + static char pvname[16]; + struct reloc *rel; + int idx; + if (insn->call_dest) return insn->call_dest->name; + rel = insn_reloc(NULL, insn); + if (rel && !strcmp(rel->sym->name, "pv_ops")) { + idx = (rel->addend / sizeof(void *)); + snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx); + return pvname; + } + return "{dynamic}"; } -static inline bool noinstr_call_dest(struct symbol *func) +static bool pv_call_dest(struct objtool_file *file, struct instruction *insn) +{ + struct symbol *target; + struct reloc *rel; + int idx; + + rel = insn_reloc(file, insn); + if (!rel || strcmp(rel->sym->name, "pv_ops")) + return false; + + idx = (arch_dest_reloc_offset(rel->addend) / sizeof(void *)); + + if (file->pv_ops[idx].clean) + return true; + + file->pv_ops[idx].clean = true; + + list_for_each_entry(target, &file->pv_ops[idx].targets, pv_target) { + if (!target->sec->noinstr) { + WARN("pv_ops[%d]: %s", idx, target->name); + file->pv_ops[idx].clean = false; + } + } + + 
return file->pv_ops[idx].clean; +} + +static inline bool noinstr_call_dest(struct objtool_file *file, + struct instruction *insn, + struct symbol *func) { /* * We can't deal with indirect function calls at present; * assume they're instrumented. */ - if (!func) + if (!func) { + if (file->pv_ops) + return pv_call_dest(file, insn); + return false; + } /* * If the symbol is from a noinstr section; we good. @@ -2594,10 +2899,12 @@ static inline bool noinstr_call_dest(struct symbol *func) return false; } -static int validate_call(struct instruction *insn, struct insn_state *state) +static int validate_call(struct objtool_file *file, + struct instruction *insn, + struct insn_state *state) { if (state->noinstr && state->instr <= 0 && - !noinstr_call_dest(insn->call_dest)) { + !noinstr_call_dest(file, insn, insn->call_dest)) { WARN_FUNC("call to %s() leaves .noinstr.text section", insn->sec, insn->offset, call_dest_name(insn)); return 1; @@ -2618,7 +2925,9 @@ static int validate_call(struct instruction *insn, struct insn_state *state) return 0; } -static int validate_sibling_call(struct instruction *insn, struct insn_state *state) +static int validate_sibling_call(struct objtool_file *file, + struct instruction *insn, + struct insn_state *state) { if (has_modified_stack_frame(insn, state)) { WARN_FUNC("sibling call from callable instruction with modified stack frame", @@ -2626,7 +2935,7 @@ static int validate_sibling_call(struct instruction *insn, struct insn_state *st return 1; } - return validate_call(insn, state); + return validate_call(file, insn, state); } static int validate_return(struct symbol *func, struct instruction *insn, struct insn_state *state) @@ -2696,7 +3005,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, struct instruction *insn, struct insn_state state) { struct alternative *alt; - struct instruction *next_insn; + struct instruction *next_insn, *prev_insn = NULL; struct section *sec; u8 visited; int ret; @@ -2725,15 +3034,25 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (insn->visited & visited) return 0; + } else { + nr_insns_visited++; } if (state.noinstr) state.instr += insn->instr; - if (insn->hint) - state.cfi = insn->cfi; - else - insn->cfi = state.cfi; + if (insn->hint) { + state.cfi = *insn->cfi; + } else { + /* XXX track if we actually changed state.cfi */ + + if (prev_insn && !cficmp(prev_insn->cfi, &state.cfi)) { + insn->cfi = prev_insn->cfi; + nr_cfi_reused++; + } else { + insn->cfi = cfi_hash_find_or_add(&state.cfi); + } + } insn->visited |= visited; @@ -2769,7 +3088,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_CALL: case INSN_CALL_DYNAMIC: - ret = validate_call(insn, &state); + ret = validate_call(file, insn, &state); if (ret) return ret; @@ -2788,7 +3107,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_CONDITIONAL: case INSN_JUMP_UNCONDITIONAL: if (is_sibling_call(insn)) { - ret = validate_sibling_call(insn, &state); + ret = validate_sibling_call(file, insn, &state); if (ret) return ret; @@ -2810,7 +3129,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_DYNAMIC: case INSN_JUMP_DYNAMIC_CONDITIONAL: if (is_sibling_call(insn)) { - ret = validate_sibling_call(insn, &state); + ret = validate_sibling_call(file, insn, &state); if (ret) return ret; } @@ -2883,6 +3202,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 1; } + prev_insn = 
insn; insn = next_insn; } @@ -3138,10 +3458,20 @@ int check(struct objtool_file *file) int ret, warnings = 0; arch_initial_func_cfi_state(&initial_func_cfi); + init_cfi_state(&init_cfi); + init_cfi_state(&func_cfi); + set_func_state(&func_cfi); + + if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) + goto out; + + cfi_hash_add(&init_cfi); + cfi_hash_add(&func_cfi); ret = decode_sections(file); if (ret < 0) goto out; + warnings += ret; if (list_empty(&file->insn_list)) @@ -3185,6 +3515,13 @@ int check(struct objtool_file *file) goto out; warnings += ret; + if (retpoline) { + ret = create_retpoline_sites_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (mcount) { ret = create_mcount_loc_sections(file); if (ret < 0) @@ -3192,6 +3529,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (stats) { + printf("nr_insns_visited: %ld\n", nr_insns_visited); + printf("nr_cfi: %ld\n", nr_cfi); + printf("nr_cfi_reused: %ld\n", nr_cfi_reused); + printf("nr_cfi_cache: %ld\n", nr_cfi_cache); + } + out: /* * For now, don't fail the kernel build on fatal warnings. These diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 8676c7598728..81a4c543ff7e 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -286,10 +286,9 @@ static int read_sections(struct elf *elf) return -1; } } - sec->len = sec->sh.sh_size; if (sec->sh.sh_flags & SHF_EXECINSTR) - elf->text_size += sec->len; + elf->text_size += sec->sh.sh_size; list_add_tail(&sec->list, &elf->sections); elf_hash_add(section, &sec->hash, sec->idx); @@ -509,6 +508,7 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); + sec->reloc->sh.sh_size += sec->reloc->sh.sh_entsize; sec->reloc->changed = true; return 0; @@ -734,97 +734,13 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) data->d_size = strlen(str) + 1; data->d_align = 1; - len = strtab->len; - strtab->len += data->d_size; + len = strtab->sh.sh_size; + strtab->sh.sh_size += data->d_size; strtab->changed = true; return len; } -struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) -{ - struct section *symtab, *symtab_shndx; - struct symbol *sym; - Elf_Data *data; - Elf_Scn *s; - - sym = malloc(sizeof(*sym)); - if (!sym) { - perror("malloc"); - return NULL; - } - memset(sym, 0, sizeof(*sym)); - - sym->name = strdup(name); - - sym->sym.st_name = elf_add_string(elf, NULL, sym->name); - if (sym->sym.st_name == -1) - return NULL; - - sym->sym.st_info = GELF_ST_INFO(STB_GLOBAL, STT_NOTYPE); - // st_other 0 - // st_shndx 0 - // st_value 0 - // st_size 0 - - symtab = find_section_by_name(elf, ".symtab"); - if (!symtab) { - WARN("can't find .symtab"); - return NULL; - } - - s = elf_getscn(elf->elf, symtab->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return NULL; - } - - data = elf_newdata(s); - if (!data) { - WARN_ELF("elf_newdata"); - return NULL; - } - - data->d_buf = &sym->sym; - data->d_size = sizeof(sym->sym); - data->d_align = 1; - data->d_type = ELF_T_SYM; - - sym->idx = symtab->len / sizeof(sym->sym); - - symtab->len += data->d_size; - symtab->changed = true; - - symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); - if (symtab_shndx) { - s = elf_getscn(elf->elf, symtab_shndx->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return NULL; - } - - data = elf_newdata(s); - if (!data) { - WARN_ELF("elf_newdata"); - return NULL; - } - - data->d_buf = &sym->sym.st_size; /* 
conveniently 0 */ - data->d_size = sizeof(Elf32_Word); - data->d_align = 4; - data->d_type = ELF_T_WORD; - - symtab_shndx->len += 4; - symtab_shndx->changed = true; - } - - sym->sec = find_section_by_index(elf, 0); - - elf_add_symbol(elf, sym); - - return sym; -} - struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr) { @@ -855,7 +771,6 @@ struct section *elf_create_section(struct elf *elf, const char *name, } sec->idx = elf_ndxscn(s); - sec->len = size; sec->changed = true; sec->data = elf_newdata(s); @@ -979,63 +894,63 @@ static struct section *elf_create_reloc_section(struct elf *elf, } } -static int elf_rebuild_rel_reloc_section(struct section *sec, int nr) +static int elf_rebuild_rel_reloc_section(struct section *sec) { struct reloc *reloc; - int idx = 0, size; + int idx = 0; void *buf; /* Allocate a buffer for relocations */ - size = nr * sizeof(GElf_Rel); - buf = malloc(size); + buf = malloc(sec->sh.sh_size); if (!buf) { perror("malloc"); return -1; } sec->data->d_buf = buf; - sec->data->d_size = size; + sec->data->d_size = sec->sh.sh_size; sec->data->d_type = ELF_T_REL; - sec->sh.sh_size = size; - idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { reloc->rel.r_offset = reloc->offset; reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); - gelf_update_rel(sec->data, idx, &reloc->rel); + if (!gelf_update_rel(sec->data, idx, &reloc->rel)) { + WARN_ELF("gelf_update_rel"); + return -1; + } idx++; } return 0; } -static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) +static int elf_rebuild_rela_reloc_section(struct section *sec) { struct reloc *reloc; - int idx = 0, size; + int idx = 0; void *buf; /* Allocate a buffer for relocations with addends */ - size = nr * sizeof(GElf_Rela); - buf = malloc(size); + buf = malloc(sec->sh.sh_size); if (!buf) { perror("malloc"); return -1; } sec->data->d_buf = buf; - sec->data->d_size = size; + sec->data->d_size = sec->sh.sh_size; sec->data->d_type = ELF_T_RELA; - sec->sh.sh_size = size; - idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { reloc->rela.r_offset = reloc->offset; reloc->rela.r_addend = reloc->addend; reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); - gelf_update_rela(sec->data, idx, &reloc->rela); + if (!gelf_update_rela(sec->data, idx, &reloc->rela)) { + WARN_ELF("gelf_update_rela"); + return -1; + } idx++; } @@ -1044,16 +959,9 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) static int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) { - struct reloc *reloc; - int nr; - - nr = 0; - list_for_each_entry(reloc, &sec->reloc_list, list) - nr++; - switch (sec->sh.sh_type) { - case SHT_REL: return elf_rebuild_rel_reloc_section(sec, nr); - case SHT_RELA: return elf_rebuild_rela_reloc_section(sec, nr); + case SHT_REL: return elf_rebuild_rel_reloc_section(sec); + case SHT_RELA: return elf_rebuild_rela_reloc_section(sec); default: return -1; } } @@ -1113,12 +1021,6 @@ int elf_write(struct elf *elf) /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->changed) { - if (sec->base && - elf_rebuild_reloc_section(elf, sec)) { - WARN("elf_rebuild_reloc_section"); - return -1; - } - s = elf_getscn(elf->elf, sec->idx); if (!s) { WARN_ELF("elf_getscn"); @@ -1129,6 +1031,12 @@ int elf_write(struct elf *elf) return -1; } + if (sec->base && + elf_rebuild_reloc_section(elf, sec)) { + WARN("elf_rebuild_reloc_section"); + return -1; + 
} + sec->changed = false; elf->changed = true; } diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 062bb6e9b865..589ff58426ab 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -69,7 +69,7 @@ struct instruction; void arch_initial_func_cfi_state(struct cfi_init_state *state); -int arch_decode_instruction(const struct elf *elf, const struct section *sec, +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, unsigned long *immediate, @@ -82,8 +82,9 @@ unsigned long arch_jump_destination(struct instruction *insn); unsigned long arch_dest_reloc_offset(int addend); const char *arch_nop_insn(int len); +const char *arch_ret_insn(int len); -int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg); +int arch_decode_hint_reg(u8 sp_reg, int *base); bool arch_is_retpoline(struct symbol *sym); diff --git a/tools/objtool/include/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h index fd5cb0bed9bf..f11d1ac1dadf 100644 --- a/tools/objtool/include/objtool/cfi.h +++ b/tools/objtool/include/objtool/cfi.h @@ -7,6 +7,7 @@ #define _OBJTOOL_CFI_H #include <arch/cfi_regs.h> +#include <linux/list.h> #define CFI_UNDEFINED -1 #define CFI_CFA -2 @@ -24,6 +25,7 @@ struct cfi_init_state { }; struct cfi_state { + struct hlist_node hash; /* must be first, cficmp() */ struct cfi_reg regs[CFI_NUM_REGS]; struct cfi_reg vals[CFI_NUM_REGS]; struct cfi_reg cfa; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 56d50bc50c10..6cfff078897f 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -40,7 +40,6 @@ struct instruction { struct list_head list; struct hlist_node hash; struct list_head call_node; - struct list_head mcount_loc_node; struct section *sec; unsigned long offset; unsigned int len; @@ -60,7 +59,7 @@ struct instruction { struct list_head alts; struct symbol *func; struct list_head stack_ops; - struct cfi_state cfi; + struct cfi_state *cfi; }; static inline bool is_static_jump(struct instruction *insn) diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index e34395047530..cdc739fa9a6f 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -38,7 +38,6 @@ struct section { Elf_Data *data; char *name; int idx; - unsigned int len; bool changed, text, rodata, noinstr; }; @@ -55,8 +54,12 @@ struct symbol { unsigned long offset; unsigned int len; struct symbol *pfunc, *cfunc, *alias; - bool uaccess_safe; - bool static_call_tramp; + u8 uaccess_safe : 1; + u8 static_call_tramp : 1; + u8 retpoline_thunk : 1; + u8 fentry : 1; + u8 kcov : 1; + struct list_head pv_target; }; struct reloc { @@ -141,7 +144,6 @@ int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int len, const char *insn); int elf_write_reloc(struct elf *elf, struct reloc *reloc); -struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name); int elf_write(struct elf *elf); void elf_close(struct elf *elf); diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index 24fa83634de4..f99fbc6078d5 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -14,6 +14,11 @@ #define __weak __attribute__((weak)) +struct pv_state { + bool clean; + struct list_head targets; +}; + 
struct objtool_file { struct elf *elf; struct list_head insn_list; @@ -25,10 +30,14 @@ struct objtool_file { unsigned long jl_short, jl_long; unsigned long jl_nop_short, jl_nop_long; + + struct pv_state *pv_ops; }; struct objtool_file *objtool_open_read(const char *_objname); +void objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func); + int check(struct objtool_file *file); int orc_dump(const char *objname); int orc_create(struct objtool_file *file); diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index e21db8bce493..c90c7084e45a 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -135,6 +135,28 @@ struct objtool_file *objtool_open_read(const char *_objname) return &file; } +void objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func) +{ + if (!noinstr) + return; + + if (!f->pv_ops) { + WARN("paravirt confusion"); + return; + } + + /* + * These functions will be patched into native code, + * see paravirt_patch(). + */ + if (!strcmp(func->name, "_paravirt_nop") || + !strcmp(func->name, "_paravirt_ident_64")) + return; + + list_add(&func->pv_target, &f->pv_ops[idx].targets); + f->pv_ops[idx].clean = false; +} + static void cmd_usage(void) { unsigned int i, longest = 0; diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index dc9b7dd314b0..dd3c64af9db2 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -13,13 +13,19 @@ #include <objtool/warn.h> #include <objtool/endianness.h> -static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi) +static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, + struct instruction *insn) { - struct instruction *insn = container_of(cfi, struct instruction, cfi); struct cfi_reg *bp = &cfi->regs[CFI_BP]; memset(orc, 0, sizeof(*orc)); + if (!cfi) { + orc->end = 0; + orc->sp_reg = ORC_REG_UNDEFINED; + return 0; + } + orc->end = cfi->end; if (cfi->cfa.base == CFI_UNDEFINED) { @@ -162,7 +168,7 @@ int orc_create(struct objtool_file *file) int i; if (!alt_group) { - if (init_orc_entry(&orc, &insn->cfi)) + if (init_orc_entry(&orc, insn->cfi, insn)) return -1; if (!memcmp(&prev_orc, &orc, sizeof(orc))) continue; @@ -186,7 +192,8 @@ int orc_create(struct objtool_file *file) struct cfi_state *cfi = alt_group->cfi[i]; if (!cfi) continue; - if (init_orc_entry(&orc, cfi)) + /* errors are reported on the original insn */ + if (init_orc_entry(&orc, cfi, insn)) return -1; if (!memcmp(&prev_orc, &orc, sizeof(orc))) continue; @@ -204,7 +211,7 @@ int orc_create(struct objtool_file *file) /* Add a section terminator */ if (!empty) { - orc_list_add(&orc_list, &null, sec, sec->len); + orc_list_add(&orc_list, &null, sec, sec->sh.sh_size); nr++; } } diff --git a/tools/objtool/special.c b/tools/objtool/special.c index f1428e32a505..e2223dd91c37 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -58,22 +58,11 @@ void __weak arch_handle_alternative(unsigned short feature, struct special_alt * { } -static bool reloc2sec_off(struct reloc *reloc, struct section **sec, unsigned long *off) +static void reloc_to_sec_off(struct reloc *reloc, struct section **sec, + unsigned long *off) { - switch (reloc->sym->type) { - case STT_FUNC: - *sec = reloc->sym->sec; - *off = reloc->sym->offset + reloc->addend; - return true; - - case STT_SECTION: - *sec = reloc->sym->sec; - *off = reloc->addend; - return true; - - default: - return false; - } + *sec = reloc->sym->sec; + *off = reloc->sym->offset + reloc->addend; } static int get_alt_entry(struct elf *elf, struct 
special_entry *entry, @@ -109,13 +98,8 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, WARN_FUNC("can't find orig reloc", sec, offset + entry->orig); return -1; } - if (!reloc2sec_off(orig_reloc, &alt->orig_sec, &alt->orig_off)) { - WARN_FUNC("don't know how to handle reloc symbol type %d: %s", - sec, offset + entry->orig, - orig_reloc->sym->type, - orig_reloc->sym->name); - return -1; - } + + reloc_to_sec_off(orig_reloc, &alt->orig_sec, &alt->orig_off); if (!entry->group || alt->new_len) { new_reloc = find_reloc_by_dest(elf, sec, offset + entry->new); @@ -125,21 +109,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, return -1; } - /* - * Skip retpoline .altinstr_replacement... we already rewrite the - * instructions for retpolines anyway, see arch_is_retpoline() - * usage in add_{call,jump}_destinations(). - */ - if (arch_is_retpoline(new_reloc->sym)) - return 1; - - if (!reloc2sec_off(new_reloc, &alt->new_sec, &alt->new_off)) { - WARN_FUNC("don't know how to handle reloc symbol type %d: %s", - sec, offset + entry->new, - new_reloc->sym->type, - new_reloc->sym->name); - return -1; - } + reloc_to_sec_off(new_reloc, &alt->new_sec, &alt->new_off); /* _ASM_EXTABLE_EX hack */ if (alt->new_off >= 0x7ffffff0) @@ -181,13 +151,13 @@ int special_get_alts(struct elf *elf, struct list_head *alts) if (!sec) continue; - if (sec->len % entry->size != 0) { + if (sec->sh.sh_size % entry->size != 0) { WARN("%s size not a multiple of %d", sec->name, entry->size); return -1; } - nr_entries = sec->len / entry->size; + nr_entries = sec->sh.sh_size / entry->size; for (idx = 0; idx < nr_entries; idx++) { alt = malloc(sizeof(*alt)); diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 446180401e26..14e3e8d702a0 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -143,7 +143,7 @@ FEATURE_CHECK_LDFLAGS-libcrypto = -lcrypto ifdef CSINCLUDES LIBOPENCSD_CFLAGS := -I$(CSINCLUDES) endif -OPENCSDLIBS := -lopencsd_c_api -lopencsd +OPENCSDLIBS := -lopencsd_c_api -lopencsd -lstdc++ ifdef CSLIBS LIBOPENCSD_LDFLAGS := -L$(CSLIBS) endif diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index e04313c4d840..b856afa6eb52 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -787,6 +787,8 @@ $(OUTPUT)dlfilters/%.o: dlfilters/%.c include/perf/perf_dlfilter.h $(Q)$(MKDIR) -p $(OUTPUT)dlfilters $(QUIET_CC)$(CC) -c -Iinclude $(EXTRA_CFLAGS) -o $@ -fpic $< +.SECONDARY: $(DLFILTERS:.so=.o) + $(OUTPUT)dlfilters/%.so: $(OUTPUT)dlfilters/%.o $(QUIET_LINK)$(CC) $(EXTRA_CFLAGS) -shared -o $@ $< @@ -802,7 +804,7 @@ endif $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) -LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(LDFLAGS)' +LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(filter-out -static,$(LDFLAGS))' $(LIBTRACEEVENT): FORCE $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index 3018a054526a..20cd6244863b 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -45,7 +45,7 @@ static const Dwfl_Callbacks offline_callbacks = { */ static int check_return_reg(int ra_regno, Dwarf_Frame *frame) { - Dwarf_Op ops_mem[2]; + Dwarf_Op ops_mem[3]; Dwarf_Op dummy; Dwarf_Op *ops = &dummy; size_t nops; diff 
--git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 6211d0b84b7a..c32c2eb16d7d 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -459,7 +459,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL; if (PRINT_FIELD(WEIGHT) && - evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", PERF_OUTPUT_WEIGHT)) + evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT_TYPE, "WEIGHT", PERF_OUTPUT_WEIGHT)) return -EINVAL; if (PRINT_FIELD(SYM) && @@ -4039,11 +4039,15 @@ script_found: goto out_delete; uname(&uts); - if (data.is_pipe || /* assume pipe_mode indicates native_arch */ - !strcmp(uts.machine, session->header.env.arch) || - (!strcmp(uts.machine, "x86_64") && - !strcmp(session->header.env.arch, "i386"))) + if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */ native_arch = true; + } else if (session->header.env.arch) { + if (!strcmp(uts.machine, session->header.env.arch)) + native_arch = true; + else if (!strcmp(uts.machine, "x86_64") && + !strcmp(session->header.env.arch, "i386")) + native_arch = true; + } script.session = session; script__setup_sample_type(&script); diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 6731b3cf0c2f..7c887d37b893 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -1285,6 +1285,7 @@ int main(int argc, char *argv[]) } free_arch_std_events(); + free_sys_event_tables(); free(mapfile); return 0; @@ -1306,6 +1307,7 @@ err_close_eventsfp: create_empty_mapping(output_file); err_out: free_arch_std_events(); + free_sys_event_tables(); free(mapfile); return ret; } diff --git a/tools/perf/tests/attr/test-stat-default b/tools/perf/tests/attr/test-stat-default index d9e99b3f77e6..d8ea6a88163f 100644 --- a/tools/perf/tests/attr/test-stat-default +++ b/tools/perf/tests/attr/test-stat-default @@ -68,3 +68,100 @@ fd=10 type=0 config=5 optional=1 + +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-1 b/tools/perf/tests/attr/test-stat-detailed-1 index 
8b04a055d154..b656ab93c5bf 100644 --- a/tools/perf/tests/attr/test-stat-detailed-1 +++ b/tools/perf/tests/attr/test-stat-detailed-1 @@ -70,12 +70,109 @@ type=0 config=5 optional=1 +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event11:base-stat] -fd=11 +[event20:base-stat] +fd=20 type=3 config=0 optional=1 @@ -84,8 +181,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event12:base-stat] -fd=12 +[event21:base-stat] +fd=21 type=3 config=65536 optional=1 @@ -94,8 +191,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event13:base-stat] -fd=13 +[event22:base-stat] +fd=22 type=3 config=2 optional=1 @@ -104,8 +201,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event14:base-stat] -fd=14 +[event23:base-stat] +fd=23 type=3 config=65538 optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-2 b/tools/perf/tests/attr/test-stat-detailed-2 index 4fca9f1bfbf8..97625090a1c4 100644 --- a/tools/perf/tests/attr/test-stat-detailed-2 +++ b/tools/perf/tests/attr/test-stat-detailed-2 @@ -70,12 +70,109 @@ type=0 config=5 optional=1 +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) +[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] 
+fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event11:base-stat] -fd=11 +[event20:base-stat] +fd=20 type=3 config=0 optional=1 @@ -84,8 +181,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event12:base-stat] -fd=12 +[event21:base-stat] +fd=21 type=3 config=65536 optional=1 @@ -94,8 +191,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event13:base-stat] -fd=13 +[event22:base-stat] +fd=22 type=3 config=2 optional=1 @@ -104,8 +201,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event14:base-stat] -fd=14 +[event23:base-stat] +fd=23 type=3 config=65538 optional=1 @@ -114,8 +211,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event15:base-stat] -fd=15 +[event24:base-stat] +fd=24 type=3 config=1 optional=1 @@ -124,8 +221,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event16:base-stat] -fd=16 +[event25:base-stat] +fd=25 type=3 config=65537 optional=1 @@ -134,8 +231,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event17:base-stat] -fd=17 +[event26:base-stat] +fd=26 type=3 config=3 optional=1 @@ -144,8 +241,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event18:base-stat] -fd=18 +[event27:base-stat] +fd=27 type=3 config=65539 optional=1 @@ -154,8 +251,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event19:base-stat] -fd=19 +[event28:base-stat] +fd=28 type=3 config=4 optional=1 @@ -164,8 +261,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event20:base-stat] -fd=20 +[event29:base-stat] +fd=29 type=3 config=65540 optional=1 diff --git a/tools/perf/tests/attr/test-stat-detailed-3 b/tools/perf/tests/attr/test-stat-detailed-3 index 4bb58e1c82a6..d555042e3fbf 100644 --- a/tools/perf/tests/attr/test-stat-detailed-3 +++ b/tools/perf/tests/attr/test-stat-detailed-3 @@ -70,12 +70,109 @@ type=0 config=5 optional=1 +# PERF_TYPE_RAW / slots (0x400) +[event11:base-stat] +fd=11 +group_fd=-1 +type=4 +config=1024 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-retiring (0x8000) 
+[event12:base-stat] +fd=12 +group_fd=11 +type=4 +config=32768 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-bad-spec (0x8100) +[event13:base-stat] +fd=13 +group_fd=11 +type=4 +config=33024 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fe-bound (0x8200) +[event14:base-stat] +fd=14 +group_fd=11 +type=4 +config=33280 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-be-bound (0x8300) +[event15:base-stat] +fd=15 +group_fd=11 +type=4 +config=33536 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) +[event16:base-stat] +fd=16 +group_fd=11 +type=4 +config=33792 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) +[event17:base-stat] +fd=17 +group_fd=11 +type=4 +config=34048 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) +[event18:base-stat] +fd=18 +group_fd=11 +type=4 +config=34304 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + +# PERF_TYPE_RAW / topdown-mem-bound (0x8700) +[event19:base-stat] +fd=19 +group_fd=11 +type=4 +config=34560 +disabled=0 +enable_on_exec=0 +read_format=15 +optional=1 + # PERF_TYPE_HW_CACHE / # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event11:base-stat] -fd=11 +[event20:base-stat] +fd=20 type=3 config=0 optional=1 @@ -84,8 +181,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event12:base-stat] -fd=12 +[event21:base-stat] +fd=21 type=3 config=65536 optional=1 @@ -94,8 +191,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event13:base-stat] -fd=13 +[event22:base-stat] +fd=22 type=3 config=2 optional=1 @@ -104,8 +201,8 @@ optional=1 # PERF_COUNT_HW_CACHE_LL << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event14:base-stat] -fd=14 +[event23:base-stat] +fd=23 type=3 config=65538 optional=1 @@ -114,8 +211,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event15:base-stat] -fd=15 +[event24:base-stat] +fd=24 type=3 config=1 optional=1 @@ -124,8 +221,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1I << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event16:base-stat] -fd=16 +[event25:base-stat] +fd=25 type=3 config=65537 optional=1 @@ -134,8 +231,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event17:base-stat] -fd=17 +[event26:base-stat] +fd=26 type=3 config=3 optional=1 @@ -144,8 +241,8 @@ optional=1 # PERF_COUNT_HW_CACHE_DTLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event18:base-stat] -fd=18 +[event27:base-stat] +fd=27 type=3 config=65539 optional=1 @@ -154,8 +251,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event19:base-stat] -fd=19 +[event28:base-stat] +fd=28 type=3 config=4 optional=1 @@ -164,8 +261,8 @@ optional=1 # PERF_COUNT_HW_CACHE_ITLB << 0 | # (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event20:base-stat] -fd=20 
+[event29:base-stat] +fd=29 type=3 config=65540 optional=1 @@ -174,8 +271,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) -[event21:base-stat] -fd=21 +[event30:base-stat] +fd=30 type=3 config=512 optional=1 @@ -184,8 +281,8 @@ optional=1 # PERF_COUNT_HW_CACHE_L1D << 0 | # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) -[event22:base-stat] -fd=22 +[event31:base-stat] +fd=31 type=3 config=66048 optional=1 diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index f0e75df72b80..3167b4628b6d 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -301,6 +301,16 @@ static const char * const mem_lvlnum[] = { [PERF_MEM_LVLNUM_NA] = "N/A", }; +static const char * const mem_hops[] = { + "N/A", + /* + * While printing, 'Remote' will be added to represent + * 'Remote core, same node' accesses, as the remote field needs + * to be set along with the mem_hops field. + */ + "core, same node", +}; + int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { size_t i, l = 0; @@ -320,12 +330,14 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) /* already taken care of */ m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS); - if (mem_info && mem_info->data_src.mem_remote) { strcat(out, "Remote "); l += 7; } + if (mem_info && mem_info->data_src.mem_hops) + l += scnprintf(out + l, sz - l, "%s ", mem_hops[mem_info->data_src.mem_hops]); + printed = 0; for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) { if (!(m & 0x1)) @@ -472,8 +484,12 @@ int c2c_decode_stats(struct c2c_stats *stats, struct mem_info *mi) /* * Skylake might report unknown remote level via this * bit, consider it when evaluating remote HITMs. + * + * In case of Power, the remote field can also be used to denote + * cache accesses from another core of the same node. Hence, set + * mrem only when HOPS is zero and the remote field is set. 
*/ - bool mrem = data_src->mem_remote; + bool mrem = (data_src->mem_remote && !data_src->mem_hops); int err = 0; #define HITM_INC(__f) \ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 069c2cfdd3be..352f16076e01 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2116,7 +2116,7 @@ fetch_decomp_event(u64 head, size_t mmap_size, char *buf, bool needs_swap) static int __perf_session__process_decomp_events(struct perf_session *session) { s64 skip; - u64 size, file_pos = 0; + u64 size; struct decomp *decomp = session->decomp_last; if (!decomp) @@ -2132,7 +2132,7 @@ static int __perf_session__process_decomp_events(struct perf_session *session) size = event->header.size; if (size < sizeof(struct perf_event_header) || - (skip = perf_session__process_event(session, event, file_pos)) < 0) { + (skip = perf_session__process_event(session, event, decomp->file_pos)) < 0) { pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n", decomp->file_pos + decomp->head, event->header.size, event->header.type); return -EINVAL; diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 5a931456e718..ac35c61f65f5 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -16,7 +16,7 @@ assert sys.version_info >= (3, 7), "Python version is too old" from collections import namedtuple from enum import Enum, auto -from typing import Iterable +from typing import Iterable, Sequence import kunit_config import kunit_json @@ -186,6 +186,26 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree, exec_result.elapsed_time)) return parse_result +# Problem: +# $ kunit.py run --json +# works as one would expect and prints the parsed test results as JSON. +# $ kunit.py run --json suite_name +# would *not* pass suite_name as the filter_glob and print as json. +# argparse will consider it to be another way of writing +# $ kunit.py run --json=suite_name +# i.e. it would run all tests, and dump the json to a `suite_name` file. 
+# So we hackily automatically rewrite --json => --json=stdout +pseudo_bool_flag_defaults = { + '--json': 'stdout', + '--raw_output': 'kunit', +} +def massage_argv(argv: Sequence[str]) -> Sequence[str]: + def massage_arg(arg: str) -> str: + if arg not in pseudo_bool_flag_defaults: + return arg + return f'{arg}={pseudo_bool_flag_defaults[arg]}' + return list(map(massage_arg, argv)) + def add_common_opts(parser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' @@ -303,7 +323,7 @@ def main(argv, linux=None): help='Specifies the file to read results from.', type=str, nargs='?', metavar='input_file') - cli_args = parser.parse_args(argv) + cli_args = parser.parse_args(massage_argv(argv)) if get_kernel_root_path(): os.chdir(get_kernel_root_path()) diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 619c4554cbff..1edcc8373b4e 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -408,6 +408,14 @@ class KUnitMainTest(unittest.TestCase): self.assertNotEqual(call, mock.call(StrContains('Testing complete.'))) self.assertNotEqual(call, mock.call(StrContains(' 0 tests run'))) + def test_run_raw_output_does_not_take_positional_args(self): + # --raw_output is a string flag, but we don't want it to consume + # any positional arguments, only ones after an '=' + self.linux_source_mock.run_kernel = mock.Mock(return_value=[]) + kunit.main(['run', '--raw_output', 'filter_glob'], self.linux_source_mock) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir='.kunit', filter_glob='filter_glob', timeout=300) + def test_exec_timeout(self): timeout = 3453 kunit.main(['exec', '--timeout', str(timeout)], self.linux_source_mock) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 5c5979046523..d88bb65b74cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -949,7 +949,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, int err, n; u32 key; char b; - int retries = 100; zero_verdict_count(verd_mapfd); @@ -1002,17 +1001,11 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, goto close_peer1; if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); -again: - n = read(c0, &b, 1); - if (n < 0) { - if (errno == EAGAIN && retries--) { - usleep(1000); - goto again; - } - FAIL_ERRNO("%s: read", log_prefix); - } + n = recv_timeout(c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close_peer1: xclose(p1); @@ -1571,7 +1564,6 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd, const char *log_prefix = redir_mode_str(mode); int c0, c1, p0, p1; unsigned int pass; - int retries = 100; int err, n; int sfd[2]; u32 key; @@ -1606,17 +1598,11 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); -again: - n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); - if (n < 0) { - if (errno == EAGAIN && retries--) { - usleep(1000); - goto again; - } - FAIL_ERRNO("%s: read", log_prefix); - } + n = recv_timeout(mode == REDIR_INGRESS ? 
p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close: xclose(c1); @@ -1748,7 +1734,6 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, const char *log_prefix = redir_mode_str(mode); int c0, c1, p0, p1; unsigned int pass; - int retries = 100; int err, n; u32 key; char b; @@ -1781,17 +1766,11 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); -again: - n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); - if (n < 0) { - if (errno == EAGAIN && retries--) { - usleep(1000); - goto again; - } - FAIL_ERRNO("%s: read", log_prefix); - } + n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close_cli1: xclose(c1); @@ -1841,7 +1820,6 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, const char *log_prefix = redir_mode_str(mode); int c0, c1, p0, p1; unsigned int pass; - int retries = 100; int err, n; int sfd[2]; u32 key; @@ -1876,17 +1854,11 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); -again: - n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); - if (n < 0) { - if (errno == EAGAIN && retries--) { - usleep(1000); - goto again; - } - FAIL_ERRNO("%s: read", log_prefix); - } + n = recv_timeout(mode == REDIR_INGRESS ? p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close_cli1: xclose(c1); @@ -1932,7 +1904,6 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; u32 key; char b; - int retries = 100; zero_verdict_count(verd_mapfd); @@ -1963,17 +1934,11 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); -again: - n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); - if (n < 0) { - if (errno == EAGAIN && retries--) { - usleep(1000); - goto again; - } - FAIL_ERRNO("%s: read", log_prefix); - } + n = recv_timeout(mode == REDIR_INGRESS ? 
p0 : c0, &b, 1, 0, IO_TIMEOUT_SEC); + if (n < 0) + FAIL_ERRNO("%s: recv_timeout", log_prefix); if (n == 0) - FAIL("%s: incomplete read", log_prefix); + FAIL("%s: incomplete recv", log_prefix); close: xclose(c1); diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c index 4de902ea14d8..de1c4e6de0b2 100644 --- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c +++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <errno.h> -#include <linux/fcntl.h> +#include <fcntl.h> #include <malloc.h> #include <sys/ioctl.h> diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc index 5f5b2ba3e557..60c02b482be8 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc @@ -11,8 +11,8 @@ SYSTEM="syscalls" EVENT="sys_enter_openat" FIELD="filename" EPROBE="eprobe_open" - -echo "e:$EPROBE $SYSTEM/$EVENT file=+0(\$filename):ustring" >> dynamic_events +OPTIONS="file=+0(\$filename):ustring" +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events grep -q "$EPROBE" dynamic_events test -d events/eprobes/$EPROBE @@ -37,4 +37,54 @@ echo "-:$EPROBE" >> dynamic_events ! grep -q "$EPROBE" dynamic_events ! test -d events/eprobes/$EPROBE +# test various ways to remove the probe (already tested with just event name) + +# With group name +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With group name and system/event +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With just event name and system/event +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:$EPROBE $SYSTEM/$EVENT" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With just event name and system/event and options +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# With group name and system/event and options +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +# Finally make sure what is in the dynamic_events file clears it too +echo "e:$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +LINE=`sed -e '/$EPROBE/s/^e/-/' < dynamic_events` +test -d events/eprobes/$EPROBE +echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events +! grep -q "$EPROBE" dynamic_events +! 
test -d events/eprobes/$EPROBE + clear_trace diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 0e78b49d0f2f..fbcbdb6963b3 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -8,3 +8,4 @@ futex_wait_uninitialized_heap futex_wait_wouldblock futex_wait futex_requeue +futex_waitv diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index bd1fec59e010..5cc38de9d8ea 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -17,7 +17,8 @@ TEST_GEN_FILES := \ futex_wait_uninitialized_heap \ futex_wait_private_mapped_file \ futex_wait \ - futex_requeue + futex_requeue \ + futex_waitv TEST_PROGS := run.sh diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index 1f8f6daaf1e7..3651ce17beeb 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -17,6 +17,7 @@ #include <pthread.h> #include "futextest.h" +#include "futex2test.h" #include "logging.h" #define TEST_NAME "futex-wait-timeout" @@ -96,6 +97,12 @@ int main(int argc, char *argv[]) struct timespec to; pthread_t thread; int c; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1, + .flags = FUTEX_32, + .__reserved = 0 + }; while ((c = getopt(argc, argv, "cht:v:")) != -1) { switch (c) { @@ -118,7 +125,7 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(7); + ksft_set_plan(9); ksft_print_msg("%s: Block on a futex and wait for timeout\n", basename(argv[0])); ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); @@ -175,6 +182,18 @@ int main(int argc, char *argv[]) res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); + /* futex_waitv with CLOCK_MONOTONIC */ + if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) + return RET_FAIL; + res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); + test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); + + /* futex_waitv with CLOCK_REALTIME */ + if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) + return RET_FAIL; + res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); + test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT); + ksft_print_cnts(); return ret; } diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 0ae390ff8164..7d7a6a06cdb7 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -22,6 +22,7 @@ #include <string.h> #include <time.h> #include "futextest.h" +#include "futex2test.h" #include "logging.h" #define TEST_NAME "futex-wait-wouldblock" @@ -42,6 +43,12 @@ int main(int argc, char *argv[]) futex_t f1 = FUTEX_INITIALIZER; int res, ret = RET_PASS; int c; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1+1, + .flags = FUTEX_32, + .__reserved = 0 + }; while ((c = getopt(argc, argv, "cht:v:")) != -1) { switch (c) { @@ -61,18 +68,44 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); ksft_print_msg("%s: Test the unexpected futex value in 
FUTEX_WAIT\n", basename(argv[0])); info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != EWOULDBLOCK) { - fail("futex_wait returned: %d %s\n", - res ? errno : res, res ? strerror(errno) : ""); + ksft_test_result_fail("futex_wait returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wait\n"); } - print_result(TEST_NAME, ret); + if (clock_gettime(CLOCK_MONOTONIC, &to)) { + error("clock_gettime failed\n", errno); + return errno; + } + + to.tv_nsec += timeout_ns; + + if (to.tv_nsec >= 1000000000) { + to.tv_sec++; + to.tv_nsec -= 1000000000; + } + + info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); + if (!res || errno != EWOULDBLOCK) { + ksft_test_result_fail("futex_waitv returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv\n"); + } + + ksft_print_cnts(); return ret; } diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c new file mode 100644 index 000000000000..a94337f677e1 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * futex_waitv() test by André Almeida <andrealmeid@collabora.com> + * + * Copyright 2021 Collabora Ltd. + */ + +#include <errno.h> +#include <error.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <pthread.h> +#include <stdint.h> +#include <sys/shm.h> +#include "futextest.h" +#include "futex2test.h" +#include "logging.h" + +#define TEST_NAME "futex-wait" +#define WAKE_WAIT_US 10000 +#define NR_FUTEXES 30 +static struct futex_waitv waitv[NR_FUTEXES]; +u_int32_t futexes[NR_FUTEXES] = {0}; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +void *waiterfn(void *arg) +{ + struct timespec to; + int res; + + /* setting absolute timeout for futex2 */ + if (clock_gettime(CLOCK_MONOTONIC, &to)) + error("gettime64 failed\n", errno); + + to.tv_sec++; + + res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); + if (res < 0) { + ksft_test_result_fail("futex_waitv returned: %d %s\n", + errno, strerror(errno)); + } else if (res != NR_FUTEXES - 1) { + ksft_test_result_fail("futex_waitv returned: %d, expecting %d\n", + res, NR_FUTEXES - 1); + } + + return NULL; +} + +int main(int argc, char *argv[]) +{ + pthread_t waiter; + int res, ret = RET_PASS; + struct timespec to; + int c, i; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + ksft_print_header(); + ksft_set_plan(7); + ksft_print_msg("%s: Test FUTEX_WAITV\n", + basename(argv[0])); + + for (i = 0; i < NR_FUTEXES; i++) { + waitv[i].uaddr = (uintptr_t)&futexes[i]; + waitv[i].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG; + waitv[i].val = 0; + waitv[i].__reserved = 0; + } + + /* Private waitv */ + if (pthread_create(&waiter, NULL, waiterfn, NULL)) + error("pthread_create failed\n", 
errno); + + usleep(WAKE_WAIT_US); + + res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, FUTEX_PRIVATE_FLAG); + if (res != 1) { + ksft_test_result_fail("futex_wake private returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv private\n"); + } + + /* Shared waitv */ + for (i = 0; i < NR_FUTEXES; i++) { + int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); + + if (shm_id < 0) { + perror("shmget"); + exit(1); + } + + unsigned int *shared_data = shmat(shm_id, NULL, 0); + + *shared_data = 0; + waitv[i].uaddr = (uintptr_t)shared_data; + waitv[i].flags = FUTEX_32; + waitv[i].val = 0; + waitv[i].__reserved = 0; + } + + if (pthread_create(&waiter, NULL, waiterfn, NULL)) + error("pthread_create failed\n", errno); + + usleep(WAKE_WAIT_US); + + res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, 0); + if (res != 1) { + ksft_test_result_fail("futex_wake shared returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv shared\n"); + } + + for (i = 0; i < NR_FUTEXES; i++) + shmdt(u64_to_ptr(waitv[i].uaddr)); + + /* Testing a waiter without FUTEX_32 flag */ + waitv[0].flags = FUTEX_PRIVATE_FLAG; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + error("gettime64 failed\n", errno); + + to.tv_sec++; + + res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); + if (res == EINVAL) { + ksft_test_result_fail("futex_waitv private returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv without FUTEX_32\n"); + } + + /* Testing a waiter with an unaligned address */ + waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; + waitv[0].uaddr = 1; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + error("gettime64 failed\n", errno); + + to.tv_sec++; + + res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); + if (res == EINVAL) { + ksft_test_result_fail("futex_wake private returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv with an unaligned address\n"); + } + + /* Testing a NULL address for waiters.uaddr */ + waitv[0].uaddr = 0x00000000; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + error("gettime64 failed\n", errno); + + to.tv_sec++; + + res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); + if (res == EINVAL) { + ksft_test_result_fail("futex_waitv private returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); + } + + /* Testing a NULL address for *waiters */ + if (clock_gettime(CLOCK_MONOTONIC, &to)) + error("gettime64 failed\n", errno); + + to.tv_sec++; + + res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); + if (res == EINVAL) { + ksft_test_result_fail("futex_waitv private returned: %d %s\n", + res ? errno : res, + res ? strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); + } + + /* Testing an invalid clockid */ + if (clock_gettime(CLOCK_MONOTONIC, &to)) + error("gettime64 failed\n", errno); + + to.tv_sec++; + + res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_TAI); + if (res == EINVAL) { + ksft_test_result_fail("futex_waitv private returned: %d %s\n", + res ? errno : res, + res ? 
strerror(errno) : ""); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_waitv invalid clockid\n"); + } + + ksft_print_cnts(); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 11a9d62290f5..5ccd599da6c3 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -79,3 +79,6 @@ echo echo ./futex_requeue $COLOR + +echo +./futex_waitv $COLOR diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h new file mode 100644 index 000000000000..9d305520e849 --- /dev/null +++ b/tools/testing/selftests/futex/include/futex2test.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Futex2 library addons for futex tests + * + * Copyright 2021 Collabora Ltd. + */ +#include <stdint.h> + +#define u64_to_ptr(x) ((void *)(uintptr_t)(x)) + +/** + * futex_waitv - Wait at multiple futexes, wake on any + * @waiters: Array of waiters + * @nr_waiters: Length of waiters array + * @flags: Operation flags + * @timo: Optional timeout for operation + */ +static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters, + unsigned long flags, struct timespec *timo, clockid_t clockid) +{ + return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid); +} diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 21b646d10b88..86ab429fe7f3 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -43,3 +43,4 @@ CONFIG_NET_ACT_TUNNEL_KEY=m CONFIG_NET_ACT_MIRRED=m CONFIG_BAREUDP=m CONFIG_IPV6_IOAM6_LWTUNNEL=y +CONFIG_CRYPTO_SM4=y diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 13350cd5c8ac..3313566ce906 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -289,6 +289,12 @@ set_sysctl() run_cmd sysctl -q -w $* } +# get sysctl values in NS-A +get_sysctl() +{ + ${NSA_CMD} sysctl -n $* +} + ################################################################################ # Setup for tests @@ -439,10 +445,13 @@ cleanup() ip -netns ${NSA} link set dev ${NSA_DEV} down ip -netns ${NSA} link del dev ${NSA_DEV} + ip netns pids ${NSA} | xargs kill 2>/dev/null ip netns del ${NSA} fi + ip netns pids ${NSB} | xargs kill 2>/dev/null ip netns del ${NSB} + ip netns pids ${NSC} | xargs kill 2>/dev/null ip netns del ${NSC} >/dev/null 2>&1 } @@ -1003,6 +1012,60 @@ ipv4_tcp_md5() run_cmd nettest -s -I ${NSA_DEV} -M ${MD5_PW} -m ${NS_NET} log_test $? 1 "MD5: VRF: Device must be a VRF - prefix" + test_ipv4_md5_vrf__vrf_server__no_bind_ifindex + test_ipv4_md5_vrf__global_server__bind_ifindex0 +} + +test_ipv4_md5_vrf__vrf_server__no_bind_ifindex() +{ + log_start + show_hint "Simulates applications using VRF without TCP_MD5SIG_FLAG_IFINDEX" + run_cmd nettest -s -I ${VRF} -M ${MD5_PW} -m ${NS_NET} --no-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: VRF-bound server, unbound key accepts connection" + + log_start + show_hint "Binding both the socket and the key is not required but it works" + run_cmd nettest -s -I ${VRF} -M ${MD5_PW} -m ${NS_NET} --force-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 
0 "MD5: VRF: VRF-bound server, bound key accepts connection" +} + +test_ipv4_md5_vrf__global_server__bind_ifindex0() +{ + # This particular test needs tcp_l3mdev_accept=1 for Global server to accept VRF connections + local old_tcp_l3mdev_accept + old_tcp_l3mdev_accept=$(get_sysctl net.ipv4.tcp_l3mdev_accept) + set_sysctl net.ipv4.tcp_l3mdev_accept=1 + + log_start + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --force-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 2 "MD5: VRF: Global server, Key bound to ifindex=0 rejects VRF connection" + + log_start + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --force-bind-key-ifindex & + sleep 1 + run_cmd_nsc nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: Global server, key bound to ifindex=0 accepts non-VRF connection" + log_start + + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --no-bind-key-ifindex & + sleep 1 + run_cmd_nsb nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: Global server, key not bound to ifindex accepts VRF connection" + + log_start + run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} --no-bind-key-ifindex & + sleep 1 + run_cmd_nsc nettest -r ${NSA_IP} -X ${MD5_PW} + log_test $? 0 "MD5: VRF: Global server, key not bound to ifindex accepts non-VRF connection" + + # restore value + set_sysctl net.ipv4.tcp_l3mdev_accept="$old_tcp_l3mdev_accept" } ipv4_tcp_novrf() diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index d97bd6889446..72ee644d47bf 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -9,6 +9,7 @@ TEST_PROGS = bridge_igmp.sh \ gre_inner_v4_multipath.sh \ gre_inner_v6_multipath.sh \ gre_multipath.sh \ + ip6_forward_instats_vrf.sh \ ip6gre_inner_v4_multipath.sh \ ip6gre_inner_v6_multipath.sh \ ipip_flat_gre_key.sh \ diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample index b802c14d2950..e5e2fbeca22e 100644 --- a/tools/testing/selftests/net/forwarding/forwarding.config.sample +++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample @@ -39,3 +39,5 @@ NETIF_CREATE=yes # Timeout (in seconds) before ping exits regardless of how many packets have # been sent or received PING_TIMEOUT=5 +# IPv6 traceroute utility name. 
+TROUTE6=traceroute6 diff --git a/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh new file mode 100755 index 000000000000..9f5b3e2e5e95 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test ipv6 stats on the incoming if when forwarding with VRF + +ALL_TESTS=" + ipv6_ping + ipv6_in_too_big_err + ipv6_in_hdr_err + ipv6_in_addr_err + ipv6_in_discard +" + +NUM_NETIFS=4 +source lib.sh + +h1_create() +{ + simple_if_init $h1 2001:1:1::2/64 + ip -6 route add vrf v$h1 2001:1:2::/64 via 2001:1:1::1 +} + +h1_destroy() +{ + ip -6 route del vrf v$h1 2001:1:2::/64 via 2001:1:1::1 + simple_if_fini $h1 2001:1:1::2/64 +} + +router_create() +{ + vrf_create router + __simple_if_init $rtr1 router 2001:1:1::1/64 + __simple_if_init $rtr2 router 2001:1:2::1/64 + mtu_set $rtr2 1280 +} + +router_destroy() +{ + mtu_restore $rtr2 + __simple_if_fini $rtr2 2001:1:2::1/64 + __simple_if_fini $rtr1 2001:1:1::1/64 + vrf_destroy router +} + +h2_create() +{ + simple_if_init $h2 2001:1:2::2/64 + ip -6 route add vrf v$h2 2001:1:1::/64 via 2001:1:2::1 + mtu_set $h2 1280 +} + +h2_destroy() +{ + mtu_restore $h2 + ip -6 route del vrf v$h2 2001:1:1::/64 via 2001:1:2::1 + simple_if_fini $h2 2001:1:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rtr1=${NETIFS[p2]} + + rtr2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + h1_create + router_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + router_destroy + h1_destroy + vrf_cleanup +} + +ipv6_in_too_big_err() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors) + local vrf_name=$(master_name_get $h1) + + # Send too big packets + ip vrf exec $vrf_name \ + $PING6 -s 1300 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null + + local t1=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InTooBigErrors" +} + +ipv6_in_hdr_err() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InHdrErrors) + local vrf_name=$(master_name_get $h1) + + # Send packets with hop limit 1, easiest with traceroute6 as some ping6 + # doesn't allow hop limit to be specified + ip vrf exec $vrf_name \ + $TROUTE6 2001:1:2::2 &> /dev/null + + local t1=$(ipv6_stats_get $rtr1 Ip6InHdrErrors) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InHdrErrors" +} + +ipv6_in_addr_err() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InAddrErrors) + local vrf_name=$(master_name_get $h1) + + # Disable forwarding temporary while sending the packet + sysctl -qw net.ipv6.conf.all.forwarding=0 + ip vrf exec $vrf_name \ + $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null + sysctl -qw net.ipv6.conf.all.forwarding=1 + + local t1=$(ipv6_stats_get $rtr1 Ip6InAddrErrors) + test "$((t1 - t0))" -ne 0 + check_err $? + log_test "Ip6InAddrErrors" +} + +ipv6_in_discard() +{ + RET=0 + + local t0=$(ipv6_stats_get $rtr1 Ip6InDiscards) + local vrf_name=$(master_name_get $h1) + + # Add a policy to discard + ip xfrm policy add dst 2001:1:2::2/128 dir fwd action block + ip vrf exec $vrf_name \ + $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null + ip xfrm policy del dst 2001:1:2::2/128 dir fwd + + local t1=$(ipv6_stats_get $rtr1 Ip6InDiscards) + test "$((t1 - t0))" -ne 0 + check_err $? 
+ log_test "Ip6InDiscards" +} +ipv6_ping() +{ + RET=0 + + ping6_test $h1 2001:1:2::2 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index e7fc5c35b569..92087d423bcf 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -751,6 +751,14 @@ qdisc_parent_stats_get() | jq '.[] | select(.parent == "'"$parent"'") | '"$selector" } +ipv6_stats_get() +{ + local dev=$1; shift + local stat=$1; shift + + cat /proc/net/dev_snmp6/$dev | grep "^$stat" | cut -f2 +} + humanize() { local speed=$1; shift diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index 3caf72bb9c6a..a2489ec398fe 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -468,10 +468,26 @@ out_bits() for i in {0..22} do ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ - prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0 - - run_test "out_bit$i" "${desc/<n>/$i}" ioam-node-alpha ioam-node-beta \ - db01::2 db01::1 veth0 ${bit2type[$i]} 123 + prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \ + dev veth0 &>/dev/null + + local cmd_res=$? + local descr="${desc/<n>/$i}" + + if [[ $i -ge 12 && $i -le 21 ]] + then + if [ $cmd_res != 0 ] + then + npassed=$((npassed+1)) + log_test_passed "$descr" + else + nfailed=$((nfailed+1)) + log_test_failed "$descr" + fi + else + run_test "out_bit$i" "$descr" ioam-node-alpha ioam-node-beta \ + db01::2 db01::1 veth0 ${bit2type[$i]} 123 + fi done bit2size[22]=$tmp @@ -544,7 +560,7 @@ in_bits() local tmp=${bit2size[22]} bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) )) - for i in {0..22} + for i in {0..11} {22..22} do ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0 diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c index d376cb2c383c..8f6997d35816 100644 --- a/tools/testing/selftests/net/ioam6_parser.c +++ b/tools/testing/selftests/net/ioam6_parser.c @@ -94,16 +94,6 @@ enum { TEST_OUT_BIT9, TEST_OUT_BIT10, TEST_OUT_BIT11, - TEST_OUT_BIT12, - TEST_OUT_BIT13, - TEST_OUT_BIT14, - TEST_OUT_BIT15, - TEST_OUT_BIT16, - TEST_OUT_BIT17, - TEST_OUT_BIT18, - TEST_OUT_BIT19, - TEST_OUT_BIT20, - TEST_OUT_BIT21, TEST_OUT_BIT22, TEST_OUT_FULL_SUPP_TRACE, @@ -125,16 +115,6 @@ enum { TEST_IN_BIT9, TEST_IN_BIT10, TEST_IN_BIT11, - TEST_IN_BIT12, - TEST_IN_BIT13, - TEST_IN_BIT14, - TEST_IN_BIT15, - TEST_IN_BIT16, - TEST_IN_BIT17, - TEST_IN_BIT18, - TEST_IN_BIT19, - TEST_IN_BIT20, - TEST_IN_BIT21, TEST_IN_BIT22, TEST_IN_FULL_SUPP_TRACE, @@ -199,30 +179,6 @@ static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h, ioam6h->nodelen != 2 || ioam6h->remlen; - case TEST_OUT_BIT12: - case TEST_IN_BIT12: - case TEST_OUT_BIT13: - case TEST_IN_BIT13: - case TEST_OUT_BIT14: - case TEST_IN_BIT14: - case TEST_OUT_BIT15: - case TEST_IN_BIT15: - case TEST_OUT_BIT16: - case TEST_IN_BIT16: - case TEST_OUT_BIT17: - case TEST_IN_BIT17: - case TEST_OUT_BIT18: - case TEST_IN_BIT18: - case TEST_OUT_BIT19: - case TEST_IN_BIT19: - case TEST_OUT_BIT20: - case TEST_IN_BIT20: - case TEST_OUT_BIT21: - case TEST_IN_BIT21: - return ioam6h->overflow || - ioam6h->nodelen || - ioam6h->remlen != 1; - case TEST_OUT_BIT22: case TEST_IN_BIT22: return ioam6h->overflow || @@ -326,6 
+282,66 @@ static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h, *p += sizeof(__u32); } + if (ioam6h->type.bit12) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit13) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit14) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit15) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit16) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit17) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit18) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit19) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit20) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit21) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + if (ioam6h->type.bit22) { len = cnf.sc_data ? strlen(cnf.sc_data) : 0; aligned = cnf.sc_data ? __ALIGN_KERNEL(len, 4) : 0; @@ -455,26 +471,6 @@ static int str2id(const char *tname) return TEST_OUT_BIT10; if (!strcmp("out_bit11", tname)) return TEST_OUT_BIT11; - if (!strcmp("out_bit12", tname)) - return TEST_OUT_BIT12; - if (!strcmp("out_bit13", tname)) - return TEST_OUT_BIT13; - if (!strcmp("out_bit14", tname)) - return TEST_OUT_BIT14; - if (!strcmp("out_bit15", tname)) - return TEST_OUT_BIT15; - if (!strcmp("out_bit16", tname)) - return TEST_OUT_BIT16; - if (!strcmp("out_bit17", tname)) - return TEST_OUT_BIT17; - if (!strcmp("out_bit18", tname)) - return TEST_OUT_BIT18; - if (!strcmp("out_bit19", tname)) - return TEST_OUT_BIT19; - if (!strcmp("out_bit20", tname)) - return TEST_OUT_BIT20; - if (!strcmp("out_bit21", tname)) - return TEST_OUT_BIT21; if (!strcmp("out_bit22", tname)) return TEST_OUT_BIT22; if (!strcmp("out_full_supp_trace", tname)) @@ -509,26 +505,6 @@ static int str2id(const char *tname) return TEST_IN_BIT10; if (!strcmp("in_bit11", tname)) return TEST_IN_BIT11; - if (!strcmp("in_bit12", tname)) - return TEST_IN_BIT12; - if (!strcmp("in_bit13", tname)) - return TEST_IN_BIT13; - if (!strcmp("in_bit14", tname)) - return TEST_IN_BIT14; - if (!strcmp("in_bit15", tname)) - return TEST_IN_BIT15; - if (!strcmp("in_bit16", tname)) - return TEST_IN_BIT16; - if (!strcmp("in_bit17", tname)) - return TEST_IN_BIT17; - if (!strcmp("in_bit18", tname)) - return TEST_IN_BIT18; - if (!strcmp("in_bit19", tname)) - return TEST_IN_BIT19; - if (!strcmp("in_bit20", tname)) - return TEST_IN_BIT20; - if (!strcmp("in_bit21", tname)) - return TEST_IN_BIT21; if (!strcmp("in_bit22", tname)) return TEST_IN_BIT22; if (!strcmp("in_full_supp_trace", tname)) @@ -606,16 +582,6 @@ static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { [TEST_OUT_BIT9] = check_ioam_header_and_data, [TEST_OUT_BIT10] = check_ioam_header_and_data, [TEST_OUT_BIT11] = check_ioam_header_and_data, - [TEST_OUT_BIT12] = check_ioam_header, - [TEST_OUT_BIT13] = check_ioam_header, - [TEST_OUT_BIT14] = check_ioam_header, - [TEST_OUT_BIT15] = check_ioam_header, - [TEST_OUT_BIT16] = check_ioam_header, - [TEST_OUT_BIT17] = check_ioam_header, 
- [TEST_OUT_BIT18] = check_ioam_header, - [TEST_OUT_BIT19] = check_ioam_header, - [TEST_OUT_BIT20] = check_ioam_header, - [TEST_OUT_BIT21] = check_ioam_header, [TEST_OUT_BIT22] = check_ioam_header_and_data, [TEST_OUT_FULL_SUPP_TRACE] = check_ioam_header_and_data, [TEST_IN_UNDEF_NS] = check_ioam_header, @@ -633,16 +599,6 @@ static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { [TEST_IN_BIT9] = check_ioam_header_and_data, [TEST_IN_BIT10] = check_ioam_header_and_data, [TEST_IN_BIT11] = check_ioam_header_and_data, - [TEST_IN_BIT12] = check_ioam_header, - [TEST_IN_BIT13] = check_ioam_header, - [TEST_IN_BIT14] = check_ioam_header, - [TEST_IN_BIT15] = check_ioam_header, - [TEST_IN_BIT16] = check_ioam_header, - [TEST_IN_BIT17] = check_ioam_header, - [TEST_IN_BIT18] = check_ioam_header, - [TEST_IN_BIT19] = check_ioam_header, - [TEST_IN_BIT20] = check_ioam_header, - [TEST_IN_BIT21] = check_ioam_header, [TEST_IN_BIT22] = check_ioam_header_and_data, [TEST_IN_FULL_SUPP_TRACE] = check_ioam_header_and_data, [TEST_FWD_FULL_SUPP_TRACE] = check_ioam_header_and_data, diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c index bd6288302094..b599003eb5ba 100644 --- a/tools/testing/selftests/net/nettest.c +++ b/tools/testing/selftests/net/nettest.c @@ -28,6 +28,7 @@ #include <unistd.h> #include <time.h> #include <errno.h> +#include <getopt.h> #include <linux/xfrm.h> #include <linux/ipsec.h> @@ -101,6 +102,8 @@ struct sock_args { struct sockaddr_in6 v6; } md5_prefix; unsigned int prefix_len; + /* 0: default, -1: force off, +1: force on */ + int bind_key_ifindex; /* expected addresses and device index for connection */ const char *expected_dev; @@ -271,11 +274,14 @@ static int tcp_md5sig(int sd, void *addr, socklen_t alen, struct sock_args *args } memcpy(&md5sig.tcpm_addr, addr, alen); - if (args->ifindex) { + if ((args->ifindex && args->bind_key_ifindex >= 0) || args->bind_key_ifindex >= 1) { opt = TCP_MD5SIG_EXT; md5sig.tcpm_flags |= TCP_MD5SIG_FLAG_IFINDEX; md5sig.tcpm_ifindex = args->ifindex; + log_msg("TCP_MD5SIG_FLAG_IFINDEX set tcpm_ifindex=%d\n", md5sig.tcpm_ifindex); + } else { + log_msg("TCP_MD5SIG_FLAG_IFINDEX off\n", md5sig.tcpm_ifindex); } rc = setsockopt(sd, IPPROTO_TCP, opt, &md5sig, sizeof(md5sig)); @@ -1822,6 +1828,14 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args) } #define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SCi6xL:0:1:2:3:Fbq" +#define OPT_FORCE_BIND_KEY_IFINDEX 1001 +#define OPT_NO_BIND_KEY_IFINDEX 1002 + +static struct option long_opts[] = { + {"force-bind-key-ifindex", 0, 0, OPT_FORCE_BIND_KEY_IFINDEX}, + {"no-bind-key-ifindex", 0, 0, OPT_NO_BIND_KEY_IFINDEX}, + {0, 0, 0, 0} +}; static void print_usage(char *prog) { @@ -1858,6 +1872,10 @@ static void print_usage(char *prog) " -M password use MD5 sum protection\n" " -X password MD5 password for client mode\n" " -m prefix/len prefix and length to use for MD5 key\n" + " --no-bind-key-ifindex: Force TCP_MD5SIG_FLAG_IFINDEX off\n" + " --force-bind-key-ifindex: Force TCP_MD5SIG_FLAG_IFINDEX on\n" + " (default: only if -I is passed)\n" + "\n" " -g grp multicast group (e.g., 239.1.1.1)\n" " -i interactive mode (default is echo and terminate)\n" "\n" @@ -1893,7 +1911,7 @@ int main(int argc, char *argv[]) * process input args */ - while ((rc = getopt(argc, argv, GETOPT_STR)) != -1) { + while ((rc = getopt_long(argc, argv, GETOPT_STR, long_opts, NULL)) != -1) { switch (rc) { case 'B': both_mode = 1; @@ -1966,6 +1984,12 @@ int main(int argc, char *argv[]) case 
'M': args.password = optarg; break; + case OPT_FORCE_BIND_KEY_IFINDEX: + args.bind_key_ifindex = 1; + break; + case OPT_NO_BIND_KEY_IFINDEX: + args.bind_key_ifindex = -1; + break; case 'X': args.client_pw = optarg; break; diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 427d94816f2d..d4ffebb989f8 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -199,7 +199,6 @@ fi # test basic connectivity if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: ns1 cannot reach ns2" 1>&2 - bash exit 1 fi diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index d7e07f4c3d7f..da1c1e4b6c86 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -741,6 +741,149 @@ EOF return $lret } +# test port shadowing. +# create two listening services, one on router (ns0), one +# on client (ns2), which is masqueraded from ns1 point of view. +# ns2 sends udp packet coming from service port to ns1, on a highport. +# Later, if ns1 uses the same highport to connect to ns0:service, packet +# might be port-forwarded to ns2 instead. + +# second argument tells if we expect the 'fake-entry' to take effect +# (CLIENT) or not (ROUTER). +test_port_shadow() +{ + local test=$1 + local expect=$2 + local daddrc="10.0.1.99" + local daddrs="10.0.1.1" + local result="" + local logmsg="" + + echo ROUTER | ip netns exec "$ns0" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 & + nc_r=$! + + echo CLIENT | ip netns exec "$ns2" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 & + nc_c=$! + + # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. + echo "fake-entry" | ip netns exec "$ns2" nc -w 1 -p 1405 -u "$daddrc" 41404 > /dev/null + + # ns1 tries to connect to ns0:1405. With default settings this should connect + # to the client, as it matches the conntrack entry created above. + + result=$(echo "" | ip netns exec "$ns1" nc -w 1 -p 41404 -u "$daddrs" 1405) + + if [ "$result" = "$expect" ] ;then + echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}" + else + echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended" + ret=1 + fi + + kill $nc_r $nc_c 2>/dev/null + + # flush udp entries for next test round, if any + ip netns exec "$ns0" conntrack -F >/dev/null 2>&1 +} + +# This prevents port shadow of router service via packet filter: +# packets claiming to originate from service port from internal +# network are dropped. +test_port_shadow_filter() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family filter { + chain forward { + type filter hook forward priority 0; policy accept; + meta iif veth1 udp sport 1405 drop + } +} +EOF + test_port_shadow "port-filter" "ROUTER" + + ip netns exec "$ns0" nft delete table $family filter +} + +# This prevents port shadow of router service via notrack.
+test_port_shadow_notrack() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family raw { + chain prerouting { + type filter hook prerouting priority -300; policy accept; + meta iif veth0 udp dport 1405 notrack + udp dport 1405 notrack + } + chain output { + type filter hook output priority -300; policy accept; + udp sport 1405 notrack + } +} +EOF + test_port_shadow "port-notrack" "ROUTER" + + ip netns exec "$ns0" nft delete table $family raw +} + +# This prevents port shadow of router service via sport remap. +test_port_shadow_pat() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family pat { + chain postrouting { + type nat hook postrouting priority -1; policy accept; + meta iif veth1 udp sport <= 1405 masquerade to : 1406-65535 random + } +} +EOF + test_port_shadow "pat" "ROUTER" + + ip netns exec "$ns0" nft delete table $family pat +} + +test_port_shadowing() +{ + local family="ip" + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns0" nft -f /dev/stdin <<EOF +table $family nat { + chain postrouting { + type nat hook postrouting priority 0; policy accept; + meta oif veth0 masquerade + } +} +EOF + if [ $? -ne 0 ]; then + echo "SKIP: Could not add $family masquerade hook" + return $ksft_skip + fi + + # test default behaviour. Packet from ns1 to ns0 is redirected to ns2. + test_port_shadow "default" "CLIENT" + + # test packet filter based mitigation: prevent forwarding of + # packets claiming to come from the service port. + test_port_shadow_filter "$family" + + # test conntrack based mitigation: connections going or coming + # from router:service bypass connection tracking. + test_port_shadow_notrack "$family" + + # test nat based mitigation: forwarded packets coming from service port + # are masqueraded with random highport.
+ test_port_shadow_pat "$family" + + ip netns exec "$ns0" nft delete table $family nat +} # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 for i in 0 1 2; do @@ -861,6 +1004,8 @@ reset_counters $test_inet_nat && test_redirect inet $test_inet_nat && test_redirect6 inet +test_port_shadowing + if [ $ret -ne 0 ];then echo -n "FAIL: " nft --version diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 1af16d2c2a0a..52497b7b9f1d 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -341,7 +341,7 @@ void split_file_backed_thp(void) } /* write something to the file, so a file-backed THP can be allocated */ - num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc)); + num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); close(fd); if (num_written < 1) { diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 10ab56c2484a..60aa1a4fc69b 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -414,9 +414,6 @@ static void uffd_test_ctx_init_ext(uint64_t *features) uffd_test_ops->allocate_area((void **)&area_src); uffd_test_ops->allocate_area((void **)&area_dst); - uffd_test_ops->release_pages(area_src); - uffd_test_ops->release_pages(area_dst); - userfaultfd_open(features); count_verify = malloc(nr_pages * sizeof(unsigned long long)); @@ -437,6 +434,26 @@ static void uffd_test_ctx_init_ext(uint64_t *features) *(area_count(area_src, nr) + 1) = 1; } + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be erroneously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled: the above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst pages to not be trapped by missing userfaults. + * + * This release_pages() will guarantee that even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. + */ + uffd_test_ops->release_pages(area_dst); + pipefd = malloc(sizeof(int) * nr_cpus * 2); if (!pipefd) err("pipefd"); diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c index cec6f5a738e1..fa927ad16f8a 100644 --- a/tools/testing/vsock/vsock_diag_test.c +++ b/tools/testing/vsock/vsock_diag_test.c @@ -332,8 +332,6 @@ static void test_no_sockets(const struct test_opts *opts) read_vsock_stat(&sockets); check_no_sockets(&sockets); - - free_sock_stat(&sockets); } static void test_listen_socket_server(const struct test_opts *opts)
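The ipv6_stats_get() helper added to tools/testing/selftests/net/forwarding/lib.sh above simply pulls one per-device counter out of /proc/net/dev_snmp6, which is how the forwarding test reads Ip6InDiscards. A minimal usage sketch follows; the device name (veth0), the choice of counter and the pass/fail wording are illustrative assumptions, not part of the patch.

#!/bin/bash
# Sketch only: check that an IPv6 SNMP counter increases after some traffic.
# Assumes the helper below (copied from lib.sh) and a device named veth0.

ipv6_stats_get()
{
	local dev=$1; shift
	local stat=$1; shift

	cat /proc/net/dev_snmp6/$dev | grep "^$stat" | cut -f2
}

dev=veth0
stat=Ip6InDiscards

t0=$(ipv6_stats_get $dev $stat)
# ... send traffic that is expected to be discarded here (assumption) ...
t1=$(ipv6_stats_get $dev $stat)

if [ $((t1 - t0)) -gt 0 ]; then
	echo "PASS: $stat increased by $((t1 - t0)) on $dev"
else
	echo "FAIL: $stat did not change on $dev"
fi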
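The ioam6.sh and ioam6_parser.c changes above treat trace-type bits 12 to 21 as fields the kernel does not insert yet: configuring a route with only such a bit is now expected to fail, and in a full trace the corresponding data is expected to carry the 0xffffffff "not populated" filler. A hedged one-off check along the same lines, reusing the script's own bit2type/bit2size arrays and netns names, so it only makes sense inside that script's environment:

# Sketch only: verify that asking for a not-yet-supported trace-type bit
# is rejected at configuration time (here bit 12, inside ioam6.sh's setup).
i=12
if ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \
	prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \
	dev veth0 &>/dev/null; then
	echo "unexpected: trace-type bit $i was accepted"
else
	echo "expected: trace-type bit $i was rejected"
fi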
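The nettest.c change above adds two long options on top of the existing getopt string so that TCP_MD5SIG_FLAG_IFINDEX can be forced on or off independently of -I. Hypothetical invocations follow; the address range, device name and password are placeholders, and the option semantics are taken from the usage text in the patch:

# Sketch only: MD5-protected server with the key bound (or not) to a device.

# Default: the key is bound to the ifindex only because -I is given.
nettest -s -I eth0 -M testpw -m 10.0.0.0/24 &

# Force TCP_MD5SIG_FLAG_IFINDEX on (same effect as above, but explicit).
nettest -s -I eth0 -M testpw -m 10.0.0.0/24 --force-bind-key-ifindex &

# Keep the flag off even though -I is passed, so the key matches
# regardless of the interface the connection arrives on.
nettest -s -I eth0 -M testpw -m 10.0.0.0/24 --no-bind-key-ifindex &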
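The nft_nat.sh additions above demonstrate UDP port shadowing (an internal host pre-creating a conntrack entry that hijacks a later flow to the router's own service port) and exercise three countermeasures. For reference, a hedged sketch of those countermeasures as standalone rulesets outside the test harness; veth1/veth0 as internal/external interfaces and port 1405 are carried over from the test topology and would need to be adapted:

# Sketch only, mirroring the three mitigations used by the test above.

# 1) Packet filter: drop forwarded packets from the internal side that
#    claim to originate from the service port.
nft -f /dev/stdin <<'EOF'
table ip filter {
	chain forward {
		type filter hook forward priority 0; policy accept;
		meta iif veth1 udp sport 1405 drop
	}
}
EOF

# 2) notrack: exempt the router's own service traffic from conntrack so
#    a spoofed entry can never match it.
nft -f /dev/stdin <<'EOF'
table ip raw {
	chain prerouting {
		type filter hook prerouting priority -300; policy accept;
		udp dport 1405 notrack
	}
	chain output {
		type filter hook output priority -300; policy accept;
		udp sport 1405 notrack
	}
}
EOF

# 3) Source-port NAT: rewrite low internal source ports to random high
#    ports on the way out, so they cannot shadow the service port.
nft -f /dev/stdin <<'EOF'
table ip pat {
	chain postrouting {
		type nat hook postrouting priority -1; policy accept;
		meta iif veth1 udp sport <= 1405 masquerade to : 1406-65535 random
	}
}
EOF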
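The userfaultfd.c comment above explains why area_dst must be released again after initializing area_src: with THP enabled the two areas can end up sharing a huge page, so some area_dst pages would already be populated and never fault. A quick way to exercise exactly that configuration when running the selftest; the test type, size and bounce count are arbitrary illustrative choices:

# Sketch only: run the anonymous-memory userfaultfd test with THP forced on,
# which is the scenario the comment describes, then restore the old policy.
thp=$(cat /sys/kernel/mm/transparent_hugepage/enabled)
echo always > /sys/kernel/mm/transparent_hugepage/enabled

./userfaultfd anon 128 32

# The saved string marks the active policy in [brackets]; put it back.
echo "$thp" | sed 's/.*\[\(.*\)\].*/\1/' > /sys/kernel/mm/transparent_hugepage/enabled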