Age | Commit message (Collapse) | Author |
|
The if/else check for bit setting can be replaced by using the
assign_bit() helper so do so.
Suggested-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
|
|
qxl_mode_dumb_create() dereferences the qobj returned by
qxl_gem_object_create_with_handle(), but the handle is the only one
holding a reference to it.
A potential attacker could guess the returned handle value and closes it
between the return of qxl_gem_object_create_with_handle() and the qobj
usage, triggering a use-after-free scenario.
Reproducer:
int dri_fd =-1;
struct drm_mode_create_dumb arg = {0};
void gem_close(int handle);
void* trigger(void* ptr)
{
int ret;
arg.width = arg.height = 0x20;
arg.bpp = 32;
ret = ioctl(dri_fd, DRM_IOCTL_MODE_CREATE_DUMB, &arg);
if(ret)
{
perror("[*] DRM_IOCTL_MODE_CREATE_DUMB Failed");
exit(-1);
}
gem_close(arg.handle);
while(1) {
struct drm_mode_create_dumb args = {0};
args.width = args.height = 0x20;
args.bpp = 32;
ret = ioctl(dri_fd, DRM_IOCTL_MODE_CREATE_DUMB, &args);
if (ret) {
perror("[*] DRM_IOCTL_MODE_CREATE_DUMB Failed");
exit(-1);
}
printf("[*] DRM_IOCTL_MODE_CREATE_DUMB created, %d\n", args.handle);
gem_close(args.handle);
}
return NULL;
}
void gem_close(int handle)
{
struct drm_gem_close args;
args.handle = handle;
int ret = ioctl(dri_fd, DRM_IOCTL_GEM_CLOSE, &args); // gem close handle
if (!ret)
printf("gem close handle %d\n", args.handle);
}
int main(void)
{
dri_fd= open("/dev/dri/card0", O_RDWR);
printf("fd:%d\n", dri_fd);
if(dri_fd == -1)
return -1;
pthread_t tid1;
if(pthread_create(&tid1,NULL,trigger,NULL)){
perror("[*] thread_create tid1\n");
return -1;
}
while (1)
{
gem_close(arg.handle);
}
return 0;
}
This is a KASAN report:
==================================================================
BUG: KASAN: slab-use-after-free in qxl_mode_dumb_create+0x3c2/0x400 linux/drivers/gpu/drm/qxl/qxl_dumb.c:69
Write of size 1 at addr ffff88801136c240 by task poc/515
CPU: 1 PID: 515 Comm: poc Not tainted 6.3.0 #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
Call Trace:
<TASK>
__dump_stack linux/lib/dump_stack.c:88
dump_stack_lvl+0x48/0x70 linux/lib/dump_stack.c:106
print_address_description linux/mm/kasan/report.c:319
print_report+0xd2/0x660 linux/mm/kasan/report.c:430
kasan_report+0xd2/0x110 linux/mm/kasan/report.c:536
__asan_report_store1_noabort+0x17/0x30 linux/mm/kasan/report_generic.c:383
qxl_mode_dumb_create+0x3c2/0x400 linux/drivers/gpu/drm/qxl/qxl_dumb.c:69
drm_mode_create_dumb linux/drivers/gpu/drm/drm_dumb_buffers.c:96
drm_mode_create_dumb_ioctl+0x1f5/0x2d0 linux/drivers/gpu/drm/drm_dumb_buffers.c:102
drm_ioctl_kernel+0x21d/0x430 linux/drivers/gpu/drm/drm_ioctl.c:788
drm_ioctl+0x56f/0xcc0 linux/drivers/gpu/drm/drm_ioctl.c:891
vfs_ioctl linux/fs/ioctl.c:51
__do_sys_ioctl linux/fs/ioctl.c:870
__se_sys_ioctl linux/fs/ioctl.c:856
__x64_sys_ioctl+0x13d/0x1c0 linux/fs/ioctl.c:856
do_syscall_x64 linux/arch/x86/entry/common.c:50
do_syscall_64+0x5b/0x90 linux/arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x72/0xdc linux/arch/x86/entry/entry_64.S:120
RIP: 0033:0x7ff5004ff5f7
Code: 00 00 00 48 8b 05 99 c8 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 69 c8 0d 00 f7 d8 64 89 01 48
RSP: 002b:00007ff500408ea8 EFLAGS: 00000286 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ff5004ff5f7
RDX: 00007ff500408ec0 RSI: 00000000c02064b2 RDI: 0000000000000003
RBP: 00007ff500408ef0 R08: 0000000000000000 R09: 000000000000002a
R10: 0000000000000000 R11: 0000000000000286 R12: 00007fff1c6cdafe
R13: 00007fff1c6cdaff R14: 00007ff500408fc0 R15: 0000000000802000
</TASK>
Allocated by task 515:
kasan_save_stack+0x38/0x70 linux/mm/kasan/common.c:45
kasan_set_track+0x25/0x40 linux/mm/kasan/common.c:52
kasan_save_alloc_info+0x1e/0x40 linux/mm/kasan/generic.c:510
____kasan_kmalloc linux/mm/kasan/common.c:374
__kasan_kmalloc+0xc3/0xd0 linux/mm/kasan/common.c:383
kasan_kmalloc linux/./include/linux/kasan.h:196
kmalloc_trace+0x48/0xc0 linux/mm/slab_common.c:1066
kmalloc linux/./include/linux/slab.h:580
kzalloc linux/./include/linux/slab.h:720
qxl_bo_create+0x11a/0x610 linux/drivers/gpu/drm/qxl/qxl_object.c:124
qxl_gem_object_create+0xd9/0x360 linux/drivers/gpu/drm/qxl/qxl_gem.c:58
qxl_gem_object_create_with_handle+0xa1/0x180 linux/drivers/gpu/drm/qxl/qxl_gem.c:89
qxl_mode_dumb_create+0x1cd/0x400 linux/drivers/gpu/drm/qxl/qxl_dumb.c:63
drm_mode_create_dumb linux/drivers/gpu/drm/drm_dumb_buffers.c:96
drm_mode_create_dumb_ioctl+0x1f5/0x2d0 linux/drivers/gpu/drm/drm_dumb_buffers.c:102
drm_ioctl_kernel+0x21d/0x430 linux/drivers/gpu/drm/drm_ioctl.c:788
drm_ioctl+0x56f/0xcc0 linux/drivers/gpu/drm/drm_ioctl.c:891
vfs_ioctl linux/fs/ioctl.c:51
__do_sys_ioctl linux/fs/ioctl.c:870
__se_sys_ioctl linux/fs/ioctl.c:856
__x64_sys_ioctl+0x13d/0x1c0 linux/fs/ioctl.c:856
do_syscall_x64 linux/arch/x86/entry/common.c:50
do_syscall_64+0x5b/0x90 linux/arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x72/0xdc linux/arch/x86/entry/entry_64.S:120
Freed by task 515:
kasan_save_stack+0x38/0x70 linux/mm/kasan/common.c:45
kasan_set_track+0x25/0x40 linux/mm/kasan/common.c:52
kasan_save_free_info+0x2e/0x60 linux/mm/kasan/generic.c:521
____kasan_slab_free linux/mm/kasan/common.c:236
____kasan_slab_free+0x180/0x1f0 linux/mm/kasan/common.c:200
__kasan_slab_free+0x12/0x30 linux/mm/kasan/common.c:244
kasan_slab_free linux/./include/linux/kasan.h:162
slab_free_hook linux/mm/slub.c:1781
slab_free_freelist_hook+0xd2/0x1a0 linux/mm/slub.c:1807
slab_free linux/mm/slub.c:3787
__kmem_cache_free+0x196/0x2d0 linux/mm/slub.c:3800
kfree+0x78/0x120 linux/mm/slab_common.c:1019
qxl_ttm_bo_destroy+0x140/0x1a0 linux/drivers/gpu/drm/qxl/qxl_object.c:49
ttm_bo_release+0x678/0xa30 linux/drivers/gpu/drm/ttm/ttm_bo.c:381
kref_put linux/./include/linux/kref.h:65
ttm_bo_put+0x50/0x80 linux/drivers/gpu/drm/ttm/ttm_bo.c:393
qxl_gem_object_free+0x3e/0x60 linux/drivers/gpu/drm/qxl/qxl_gem.c:42
drm_gem_object_free+0x5c/0x90 linux/drivers/gpu/drm/drm_gem.c:974
kref_put linux/./include/linux/kref.h:65
__drm_gem_object_put linux/./include/drm/drm_gem.h:431
drm_gem_object_put linux/./include/drm/drm_gem.h:444
qxl_gem_object_create_with_handle+0x151/0x180 linux/drivers/gpu/drm/qxl/qxl_gem.c:100
qxl_mode_dumb_create+0x1cd/0x400 linux/drivers/gpu/drm/qxl/qxl_dumb.c:63
drm_mode_create_dumb linux/drivers/gpu/drm/drm_dumb_buffers.c:96
drm_mode_create_dumb_ioctl+0x1f5/0x2d0 linux/drivers/gpu/drm/drm_dumb_buffers.c:102
drm_ioctl_kernel+0x21d/0x430 linux/drivers/gpu/drm/drm_ioctl.c:788
drm_ioctl+0x56f/0xcc0 linux/drivers/gpu/drm/drm_ioctl.c:891
vfs_ioctl linux/fs/ioctl.c:51
__do_sys_ioctl linux/fs/ioctl.c:870
__se_sys_ioctl linux/fs/ioctl.c:856
__x64_sys_ioctl+0x13d/0x1c0 linux/fs/ioctl.c:856
do_syscall_x64 linux/arch/x86/entry/common.c:50
do_syscall_64+0x5b/0x90 linux/arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x72/0xdc linux/arch/x86/entry/entry_64.S:120
The buggy address belongs to the object at ffff88801136c000
which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 576 bytes inside of
freed 1024-byte region [ffff88801136c000, ffff88801136c400)
The buggy address belongs to the physical page:
page:0000000089fc329b refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11368
head:0000000089fc329b order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0xfffffc0010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0010200 ffff888007841dc0 dead000000000122 0000000000000000
raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88801136c100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88801136c180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff88801136c200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff88801136c280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88801136c300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
Disabling lock debugging due to kernel taint
Instead of returning a weak reference to the qxl_bo object, return the
created drm_gem_object and let the caller decrement the reference count
when it no longer needs it. As a convenience, if the caller is not
interested in the gobj object, it can pass NULL to the parameter and the
reference counting is descremented internally.
The bug and the reproducer were originally found by the Zero Day Initiative project (ZDI-CAN-20940).
Link: https://www.zerodayinitiative.com/
Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Cc: stable@vger.kernel.org
Reviewed-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230814165119.90847-1-wander@redhat.com
|
|
Systems booted with shim have a Forbidden Signature Database called mokx.
During boot, hashes and certs contained within the mokx are loaded into the
blacklist keyring. When calling verify_pkcs7_message_sig the contents of
the blacklist keyring (or revocation list) are referenced when validating
keys on the platform keyring. Currently, when validating against the
secondary or builtin keyrings, the revocation list is not referenced. Move
up the check to allow the revocation list to be used with all keyrings,
including the secondary and builtin, allowing the system owner to take
corrective action should a vulnerability be found within keys contained
within either keyring.
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Use the module_platform_driver macro to simplify the code, which is the
same as declaring with module_init() and module_exit().
Signed-off-by: Li Zetao <lizetao1@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Variable len is being accumulated but the value is never read. It is
redundant and can be removed. Cleans up clang scan build warning:
drivers/char/tpm/eventlog/tpm1.c:276:2: warning: Value stored to 'len' is never read [deadcode.DeadStores]
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Similar to the transmission of TPM responses, also the transmission of TPM
commands may become corrupted. Instead of aborting when detecting such
issues, try resending the command again.
Signed-off-by: Alexander Steffen <Alexander.Steffen@infineon.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
TPM responses may become damaged during transmission, for example due to
bit flips on the wire. Instead of aborting when detecting such issues, the
responseRetry functionality can be used to make the TPM retransmit its
response and receive it again without errors.
Signed-off-by: Alexander Steffen <Alexander.Steffen@infineon.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
The CRC functionality is initialized before tpm_tis_core, so it can be used
on all code paths within the module. Therefore, move the CRC check to the
generic send routine, that also contains all other checks for successful
command transmission, so that all those checks are in one place.
Also, this ensures that tpm_tis_ready is called when a CRC failure is
detected, to clear the invalid data from the TPM, which did not happen
previously.
Signed-off-by: Alexander Steffen <Alexander.Steffen@infineon.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
TPM devices may insert wait state on last clock cycle of ADDR phase.
For SPI controllers that support full-duplex transfers, this can be
detected using software by reading the MISO line. For SPI controllers
that only support half-duplex transfers, such as the Tegra QSPI, it is
not possible to detect the wait signal from software. The QSPI
controller in Tegra234 and Tegra241 implement hardware detection of the
wait signal which can be enabled in the controller for TPM devices.
The current TPM TIS driver only supports software detection of the wait
signal. To support SPI controllers that use hardware to detect the wait
signal, add the function tpm_tis_spi_transfer_half() and move the
existing code for software based detection into a function called
tpm_tis_spi_transfer_full(). SPI controllers that only support
half-duplex transfers will always call tpm_tis_spi_transfer_half()
because they cannot support software based detection. The bit
SPI_TPM_HW_FLOW is set to indicate to the SPI controller that hardware
detection is required and it is the responsibility of the SPI controller
driver to determine if this is supported or not.
For hardware flow control, CMD-ADDR-DATA messages are combined into a
single message where as for software flow control exiting method of
CMD-ADDR in a message and DATA in another is followed.
[jarkko: Fixed the function names to match the code change, and the tag
in the short summary.]
Signed-off-by: Krishna Yarlagadda <kyarlagadda@nvidia.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
strlcpy() reads the entire source buffer first.
This read may exceed the destination size limit.
This is both inefficient and can lead to linear read
overflows if a source string is not NUL-terminated [1].
In an effort to remove strlcpy() completely [2], replace
strlcpy() here with strscpy().
No return values were used, so direct replacement is safe.
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy
[2] https://github.com/KSPP/linux/issues/89
Signed-off-by: Azeem Shaikh <azeemshaikh38@gmail.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
On secure boot enabled PowerVM LPAR, third party code signing keys are
needed during early boot to verify signed third party modules. These
third party keys are stored in moduledb object in the Platform
KeyStore (PKS).
Load third party code signing keys onto .secondary_trusted_keys keyring.
Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
Reviewed-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Update Kconfig to enable machine keyring and limit to CA certificates
on PowerVM. Only key signing CA keys are allowed.
Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
Reviewed-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
trust_moklist() is specific to UEFI enabled systems. Other platforms
rely only on the Kconfig.
Define a generic wrapper named imputed_trust_enabled().
Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
trust_mok variable is accessed within a single function locally.
Change trust_mok from global to local static variable.
Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
Reviewed-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
On non-UEFI platforms, handle restrict_link_by_ca failures differently.
Certificates which do not satisfy CA restrictions on non-UEFI platforms
are ignored.
Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
Reviewed-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Keys that derive their trust from an entity such as a security officer,
administrator, system owner, or machine owner are said to have "imputed
trust". CA keys with imputed trust can be loaded onto the machine keyring.
The mechanism for loading these keys onto the machine keyring is platform
dependent.
Load keys stored in the variable trustedcadb onto the .machine keyring
on PowerVM platform.
Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
Reviewed-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
After being vouched for by a system keyring, only allow keys into the .ima
and .evm keyrings that have the digitalSignature usage field set.
Link: https://lore.kernel.org/all/41dffdaeb7eb7840f7e38bc691fbda836635c9f9.camel@linux.ibm.com
Suggested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Acked-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Add a new link restriction. Restrict the addition of keys in a keyring
based on the key having digitalSignature usage set. Additionally, verify
the new certificate against the ones in the system keyrings. Add two
additional functions to use the new restriction within either the builtin
or secondary keyrings.
[jarkko@kernel.org: Fix checkpatch.pl --strict issues]
Signed-off-by: Eric Snowberg <eric.snowberg@oracle.com>
Reviewed-and-tested-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
https://gitlab.freedesktop.org/agd5f/linux into drm-fixes
amd-drm-fixes-6.5-2023-08-16:
amdgpu:
- SMU 13.x fixes
- Fix mcbp parameter for gfx9
- SMU 11.x fixes
- Temporary fix for large numbers of XCP partitions
- S0ix fixes
- DCN 2.0 fix
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230816200226.10771-1-alexander.deucher@amd.com
|
|
git://anongit.freedesktop.org/drm/drm-misc into drm-fixes
One EPROBE_DEFER handling fix for the JDI LT070ME05000, a timing fix for
the AUO G121EAN01 panel, an integer overflow and a memory leak fixes for
the qaic accel, a use-after-free fix for nouveau and a revert for an
alleged fix in EDID parsing.
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Maxime Ripard <mripard@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/3olqt33em5uhxzjbqghwcwnvmw73h7bxkbdxookmnkecymd4vc@7ogm6gewpprq
|
|
git://anongit.freedesktop.org/drm/drm-intel into drm-fixes
- Fix the flow for ignoring GuC SLPC efficient frequency selection (Vinay)
- Fix SDVO panel_type initialization (Jani)
- Fix display probe for IVB Q and IVB D GT2 server (Jani)
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/ZN4yduyBU1Ev9dc7@intel.com
|
|
The kerndoc for some struct member and function arguments were missing.
Add them.
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308171742.AncabIG1-lkp@intel.com/
Signed-off-by: Kees Cook <keescook@chromium.org>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says:
====================
mlx5 fixes 2023-08-16
This series provides bug fixes to mlx5 driver.
* tag 'mlx5-fixes-2023-08-16' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux:
net/mlx5: Fix mlx5_cmd_update_root_ft() error flow
net/mlx5e: XDP, Fix fifo overrun on XDP_REDIRECT
====================
Link: https://lore.kernel.org/r/20230816204108.53819-1-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
ADQ and switchdev are not supported simultaneously. Enabling both at the
same time can result in nullptr dereference.
To prevent this, check if ADQ is active when changing devlink mode to
switchdev mode, and check if switchdev is active when enabling ADQ.
Fixes: fbc7b27af0f9 ("ice: enable ndo_setup_tc support for mqprio_qdisc")
Signed-off-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Tested-by: Sujai Buvaneswaran <sujai.buvaneswaran@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230816193405.1307580-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
While performing certain power-off sequences, PCI drivers are
called to suspend and resume their underlying devices through
PCI PM (power management) interface. However this NIC hardware
does not support PCI PM suspend/resume operations so system wide
suspend/resume leads to bad MFW (management firmware) state which
causes various follow-up errors in driver when communicating with
the device/firmware afterwards.
To fix this driver implements PCI PM suspend handler to indicate
unsupported operation to the PCI subsystem explicitly, thus avoiding
system to go into suspended/standby mode.
Without this fix device/firmware does not recover unless system
is power cycled.
Fixes: 2950219d87b0 ("qede: Add basic network device support")
Signed-off-by: Manish Chopra <manishc@marvell.com>
Signed-off-by: Alok Prasad <palok@marvell.com>
Reviewed-by: John Meneghini <jmeneghi@redhat.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230816150711.59035-1-manishc@marvell.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
One missing check in virtio_net_hdr_to_skb() allowed
syzbot to crash kernels again [1]
Do not allow gso_size to be set to GSO_BY_FRAGS (0xffff),
because this magic value is used by the kernel.
[1]
general protection fault, probably for non-canonical address 0xdffffc000000000e: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077]
CPU: 0 PID: 5039 Comm: syz-executor401 Not tainted 6.5.0-rc5-next-20230809-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023
RIP: 0010:skb_segment+0x1a52/0x3ef0 net/core/skbuff.c:4500
Code: 00 00 00 e9 ab eb ff ff e8 6b 96 5d f9 48 8b 84 24 00 01 00 00 48 8d 78 70 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e ea 21 00 00 48 8b 84 24 00 01
RSP: 0018:ffffc90003d3f1c8 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: 000000000001fffe RCX: 0000000000000000
RDX: 000000000000000e RSI: ffffffff882a3115 RDI: 0000000000000070
RBP: ffffc90003d3f378 R08: 0000000000000005 R09: 000000000000ffff
R10: 000000000000ffff R11: 5ee4a93e456187d6 R12: 000000000001ffc6
R13: dffffc0000000000 R14: 0000000000000008 R15: 000000000000ffff
FS: 00005555563f2380(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020020000 CR3: 000000001626d000 CR4: 00000000003506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
udp6_ufo_fragment+0x9d2/0xd50 net/ipv6/udp_offload.c:109
ipv6_gso_segment+0x5c4/0x17b0 net/ipv6/ip6_offload.c:120
skb_mac_gso_segment+0x292/0x610 net/core/gso.c:53
__skb_gso_segment+0x339/0x710 net/core/gso.c:124
skb_gso_segment include/net/gso.h:83 [inline]
validate_xmit_skb+0x3a5/0xf10 net/core/dev.c:3625
__dev_queue_xmit+0x8f0/0x3d60 net/core/dev.c:4329
dev_queue_xmit include/linux/netdevice.h:3082 [inline]
packet_xmit+0x257/0x380 net/packet/af_packet.c:276
packet_snd net/packet/af_packet.c:3087 [inline]
packet_sendmsg+0x24c7/0x5570 net/packet/af_packet.c:3119
sock_sendmsg_nosec net/socket.c:727 [inline]
sock_sendmsg+0xd9/0x180 net/socket.c:750
____sys_sendmsg+0x6ac/0x940 net/socket.c:2496
___sys_sendmsg+0x135/0x1d0 net/socket.c:2550
__sys_sendmsg+0x117/0x1e0 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7ff27cdb34d9
Fixes: 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Xin Long <lucien.xin@gmail.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://lore.kernel.org/r/20230816142158.1779798-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
The status of global socket memory pressure is updated when:
a) __sk_mem_raise_allocated():
enter: sk_memory_allocated(sk) > sysctl_mem[1]
leave: sk_memory_allocated(sk) <= sysctl_mem[0]
b) __sk_mem_reduce_allocated():
leave: sk_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sysctl_mem[0]
So the conditions of leaving global pressure are inconstant, which
may lead to the situation that one pressured net-memcg prevents the
global pressure from being cleared when there is indeed no global
pressure, thus the global constrains are still in effect unexpectedly
on the other sockets.
This patch fixes this by ignoring the net-memcg's pressure when
deciding whether should leave global memory pressure.
Fixes: e1aab161e013 ("socket: initial cgroup code.")
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Link: https://lore.kernel.org/r/20230816091226.1542-1-wuyun.abel@bytedance.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
Existing comment in the source explains why we don't want efx_init_tc()
failure to be fatal. Cited commit erroneously consolidated failure
paths causing the probe to be failed in this case.
Fixes: 7e056e2360d9 ("sfc: obtain device mac address based on firmware handle for ef100")
Reviewed-by: Martin Habets <habetsm.xilinx@gmail.com>
Signed-off-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://lore.kernel.org/r/aa7f589dd6028bd1ad49f0a85f37ab33c09b2b45.1692114888.git.ecree.xilinx@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
In efx_init_tc(), move the setting of efx->tc->up after the
flow_indr_dev_register() call, so that if it fails, efx_fini_tc()
won't call flow_indr_dev_unregister().
Fixes: 5b2e12d51bd8 ("sfc: bind indirect blocks for TC offload on EF100")
Suggested-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Reviewed-by: Martin Habets <habetsm.xilinx@gmail.com>
Signed-off-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://lore.kernel.org/r/a81284d7013aba74005277bd81104e4cfbea3f6f.1692114888.git.ecree.xilinx@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
Currently, the SystemCMOS address space handler is installed for the
ACPI RTC devices (PNP0B00/PNP0B01/PNP0B02) only. But there are platforms
with SystemCMOS Operetion Region defined under the ACPI Time and Alarm
Device (ACPI000E), which is used by the ACPI pre-defined control methods
like _GRT (Get the Real time) and _SRT (Set the Real time).
When accessing these control methods via the acpi_tad sysfs interface,
missing SystemCMOS address space handler causes errors like below
[ 478.255453] ACPI Error: No handler for Region [RTCM] (00000000a8d2dd39) [SystemCMOS] (20230331/evregion-130)
[ 478.255458] ACPI Error: Region SystemCMOS (ID=5) has no handler (20230331/exfldio-261)
[ 478.255461] Initialized Local Variables for Method [_GRT]:
[ 478.255461] Local1: 00000000f182542c <Obj> Integer 0000000000000000
[ 478.255464] No Arguments are initialized for method [_GRT]
[ 478.255465] ACPI Error: Aborting method \_SB.AWAC._GRT due to previous error (AE_NOT_EXIST) (20230331/psparse-529)
Export two APIs for SystemCMOS address space handler from acpi_cmos_rtc
scan handler and install the handler for the ACPI Time and Alarm Device
from the ACPI TAD driver.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217714
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
[ rjw: Subject and changelog edits, whitespace adjustment ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
When the value of ZT is set via ptrace we don't disable traps for SME.
This means that when a the task has never used SME before then the value
set via ptrace will never be seen by the target task since it will
trigger a SME access trap which will flush the register state.
Disable SME traps when setting ZT, this means we also need to allocate
storage for SVE if it is not already allocated, for the benefit of
streaming SVE.
Fixes: f90b529bcbe5 ("arm64/sme: Implement ZT0 ptrace support")
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: <stable@vger.kernel.org> # 6.3.x
Link: https://lore.kernel.org/r/20230816-arm64-zt-ptrace-first-use-v2-1-00aa82847e28@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
|
|
When we use NT_ARM_SSVE to either enable streaming mode or change the
vector length for a process we do not currently do anything to ensure that
there is storage allocated for the SME specific register state. If the
task had not previously used SME or we changed the vector length then
the task will not have had TIF_SME set or backing storage for ZA/ZT
allocated, resulting in inconsistent register sizes when saving state
and spurious traps which flush the newly set register state.
We should set TIF_SME to disable traps and ensure that storage is
allocated for ZA and ZT if it is not already allocated. This requires
modifying sme_alloc() to make the flush of any existing register state
optional so we don't disturb existing state for ZA and ZT.
Fixes: e12310a0d30f ("arm64/sme: Implement ptrace support for streaming mode SVE registers")
Reported-by: David Spickett <David.Spickett@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: <stable@vger.kernel.org> # 5.19.x
Link: https://lore.kernel.org/r/20230810-arm64-fix-ptrace-race-v1-1-a5361fad2bd6@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
|
|
acpi_create_dir()/acpi_remove_dir() are never implemented since
the beginning of git history.
Commit f8d31489629c ("ACPICA: Debugger: Convert some mechanisms to OSPM specific")
declared but never implemented acpi_run_debugger().
Commit 781d737c7466 ("ACPI: Drop power resources driver")
removed acpi_power_init() but not its declaration.
Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
Kmemleak report a leak in graph_trace_open():
unreferenced object 0xffff0040b95f4a00 (size 128):
comm "cat", pid 204981, jiffies 4301155872 (age 99771.964s)
hex dump (first 32 bytes):
e0 05 e7 b4 ab 7d 00 00 0b 00 01 00 00 00 00 00 .....}..........
f4 00 01 10 00 a0 ff ff 00 00 00 00 65 00 10 00 ............e...
backtrace:
[<000000005db27c8b>] kmem_cache_alloc_trace+0x348/0x5f0
[<000000007df90faa>] graph_trace_open+0xb0/0x344
[<00000000737524cd>] __tracing_open+0x450/0xb10
[<0000000098043327>] tracing_open+0x1a0/0x2a0
[<00000000291c3876>] do_dentry_open+0x3c0/0xdc0
[<000000004015bcd6>] vfs_open+0x98/0xd0
[<000000002b5f60c9>] do_open+0x520/0x8d0
[<00000000376c7820>] path_openat+0x1c0/0x3e0
[<00000000336a54b5>] do_filp_open+0x14c/0x324
[<000000002802df13>] do_sys_openat2+0x2c4/0x530
[<0000000094eea458>] __arm64_sys_openat+0x130/0x1c4
[<00000000a71d7881>] el0_svc_common.constprop.0+0xfc/0x394
[<00000000313647bf>] do_el0_svc+0xac/0xec
[<000000002ef1c651>] el0_svc+0x20/0x30
[<000000002fd4692a>] el0_sync_handler+0xb0/0xb4
[<000000000c309c35>] el0_sync+0x160/0x180
The root cause is descripted as follows:
__tracing_open() { // 1. File 'trace' is being opened;
...
*iter->trace = *tr->current_trace; // 2. Tracer 'function_graph' is
// currently set;
...
iter->trace->open(iter); // 3. Call graph_trace_open() here,
// and memory are allocated in it;
...
}
s_start() { // 4. The opened file is being read;
...
*iter->trace = *tr->current_trace; // 5. If tracer is switched to
// 'nop' or others, then memory
// in step 3 are leaked!!!
...
}
To fix it, in s_start(), close tracer before switching then reopen the
new tracer after switching. And some tracers like 'wakeup' may not update
'iter->private' in some cases when reopen, then it should be cleared
to avoid being mistakenly closed again.
Link: https://lore.kernel.org/linux-trace-kernel/20230817125539.1646321-1-zhengyejian1@huawei.com
Fixes: d7350c3f4569 ("tracing/core: make the read callbacks reentrants")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
|
|
Fix by using acpi_hest_get_payload() to find out the correct
generic error data for v3 structure.
The revision v300 generic error data is different from the old one, so
for compatibility with old and new version, change to a new interface to
locate the right memory error section that was defined in CPER.
Signed-off-by: Xiaochun Lee <lixc17@lenovo.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
As following methods are not used outside ice_vf_lib,
they can be made static:
ice_vf_rebuild_host_vlan_cfg
ice_vf_rebuild_host_tx_rate_cfg
ice_vf_set_host_trust_cfg
ice_vf_rebuild_host_mac_cfg
ice_vf_rebuild_aggregator_node_cfg
ice_vf_rebuild_host_cfg
ice_set_vf_state_qs_dis
ice_vf_set_initialized
In order to achieve that, the order in which these
were defined was reorganized.
Signed-off-by: Jan Sokolowski <jan.sokolowski@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
|
|
As following methods are not used outside of ice_lib,
they can be made static:
ice_vsi_is_vlan_pruning_ena
ice_vsi_cfg_frame_size
Signed-off-by: Jan Sokolowski <jan.sokolowski@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
|
|
As following methods are not used outside of ice_ddp,
they can be made static:
ice_verify_pgk
ice_pkg_val_buf
ice_aq_download_pkg
ice_aq_update_pkg
ice_find_seg_in_pkg
Signed-off-by: Jan Sokolowski <jan.sokolowski@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
|
|
Following methods were found to no longer be in use:
ice_is_pca9575_present
ice_mac_fltr_exist
ice_napi_del
Remove them.
Signed-off-by: Jan Sokolowski <jan.sokolowski@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
|
|
LKP reports below warning when building for RISC-V.
drivers/pnp/pnpacpi/core.c:253:17:
warning: 'strncpy' specified bound 50 equals destination
size [-Wstringop-truncation]
This appears to be a valid issue since the destination string may not be
null-terminated.
To fix this, append the NUL explicitly after the strncpy().
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202307241942.Rff2Nri5-lkp@intel.com/
Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
Merge series from Richard Fitzgerald <rf@opensource.cirrus.com>:
These two patches add an ACPI HID and update the way the platform-
specific firmware identifier is extracted from the ACPI.
|
|
This is never used since commit 1e3590e2e4a3 ("[PATCH] pgdat allocation
for new node add (get node id by acpi)").
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
Linux defaults to picking the non-working ACPI video backlight interface
on the Apple iMac12,1 and iMac12,2.
Add a DMI quirk to pick the working native radeon_bl0 interface instead.
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1838
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2753
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
Screen brightness can only be changed once on HP ZBook Fury 16 G10.
The vendor reports that the issue is related to the fact that Linux doesn't
invoke _PS0 at boot for all ACPI devices, as expected by the platform firmware:
Scope (\_SB.PC00.GFX0)
{
Scope (DD1F)
{
Method (_PS0, 0, Serialized) // _PS0: Power State 0
{
If (CondRefOf (\_SB.PC00.LPCB.EC0.SSBC))
{
\_SB.PC00.LPCB.EC0.SSBC ()
}
}
...
}
...
}
The \_SB.PC00.GFX0.DD1F is the panel device, and its _PS0 needs to be
executed at the initialization time to make the brightness control work
properly.
_PS0 is not evaluated for this device, because _PSC is missing,
which violates the ACPI specification (ACPI 6.5, section 7.3.6).
Commit b3785492268f ("ACPI / PM: Do not power manage devices in unknown
initial states") tried to work around missing _PSC on platforms with
defective firmware, but got reverted due to a regression.
So the safest approach is to use acpi_device_fix_up_power_extended() to
put ACPI video and its child devices to D0 to address the issue at hand.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217683
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
With ACPI Spec 6.5 chapter 5.2.12.20, each processor in LoongArch
systems has a Core Programmable Interrupt Controller in MADT. The
value of its type is 0x11 in the spec and defined as enum variable
ACPI_MADT_TYPE_CORE_PIC in the Linux kernel.
Physical IDs can be retrieved from MADT for LoongArch systems during
initialization and they can be retrieved from the _MAT output for
hotplug CPUs.
Add physical CPU ID enumeration for LoongArch systems.
Signed-off-by: Bibo Mao <maobibo@loongson.cn>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
Inside IVSC, switching ownership requires an interface with two
different hardware modules, ACE and CSI. The software interface
to these modules is based on Intel MEI framework. Usually mei
client devices are dynamically created, so the info of consumers
depending on mei client devices is not present in the firmware
tables.
This causes problems with the probe ordering with respect to
drivers for consumers of these MEI client devices. But on these
camera sensor devices, the ACPI nodes describing the sensors all
have a _DEP dependency on the matching MEI bus ACPI device, so
adding IVSC MEI bus ACPI device to acpi_honor_dep_ids allows
solving the probe-ordering problem by deferring the enumeration of
ACPI-devices which have a _DEP dependency on an IVSC mei bus ACPI
device.
On TGL platform, the HID of IVSC MEI bus ACPI device is INTC1059,
and on ADL platform, the HID is INTC1095. So add both of them to
acpi_honor_dep_ids.
Signed-off-by: Wentong Wu <wentong.wu@intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
Since for MMIO driver using FIFO registers, also known as tpm_tis, the
default (and tbh recommended) behaviour is now the polling mode, the
"tristate" workaround is no longer for benefit.
If someone wants to explicitly enable IRQs for a TPM chip that should be
without question allowed. It could very well be a piece hardware in the
existing deny list because of e.g. firmware update or something similar.
While at it, document the module parameter, as this was not done in 2006
when it first appeared in the mainline.
Link: https://lore.kernel.org/linux-integrity/20201015214430.17937-1-jsnitsel@redhat.com/
Link: https://lore.kernel.org/all/1145393776.4829.19.camel@localhost.localdomain/
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
|
|
Mike and others noticed that EEVDF does like to over-schedule quite a
bit -- which does hurt performance of a number of benchmarks /
workloads.
In particular, what seems to cause over-scheduling is that when lag is
of the same order (or larger) than the request / slice then placement
will not only cause the task to be placed left of current, but also
with a smaller deadline than current, which causes immediate
preemption.
[ notably, lag bounds are relative to HZ ]
Mike suggested we stick to picking 'current' for as long as it's
eligible to run, giving it uninterrupted runtime until it reaches
parity with the pack.
Augment Mike's suggestion by only allowing it to exhaust it's initial
request.
One random data point:
echo NO_RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
3,723,554 context-switches ( +- 0.56% )
9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% )
echo RUN_TO_PARITY > /debug/sched/features
perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000
2,556,535 context-switches ( +- 0.51% )
9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% )
Suggested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux
Pull nfsd fix from Chuck Lever:
- Fix new MSG_SPLICE_PAGES support in server's TCP sendmsg helper
* tag 'nfsd-6.5-4' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux:
sunrpc: set the bv_offset of first bvec in svc_tcp_sendmsg
|
|
Be more careful when tearing down the subrequests of an O_DIRECT write
as part of a retransmission.
Reported-by: Chris Mason <clm@fb.com>
Fixes: ed5d588fe47f ("NFS: Try to join page groups before an O_DIRECT retransmission")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
|